//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
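//
// As a brief editorial illustration of the widening described above (this
// sketch is not taken from the papers listed here), with VF = 4 a scalar loop
// such as
//
//   for (i = 0; i < n; ++i)
//     A[i] = B[i] + C[i];
//
// is conceptually rewritten into one 'wide' loop plus a scalar remainder:
//
//   for (i = 0; i + 3 < n; i += 4)   // each iteration handles 4 lanes
//     A[i..i+3] = B[i..i+3] + C[i..i+3];
//   for (; i < n; ++i)               // scalar epilogue for leftover iterations
//     A[i] = B[i] + C[i];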
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired and predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of a loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = FixedVectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
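///
/// A rough usage sketch (an editorial illustration, not normative
/// documentation): the pass driver constructs an InnerLoopVectorizer with the
/// vectorization and interleave factors chosen by the planner and cost model,
/// and then lets the planner drive code generation, roughly:
///   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
///                          &LVL, &CM, BFI, PSI);
///   LVP.executePlan(LB, DT);
/// executePlan() in turn calls createVectorizedLoopSkeleton(), widens the
/// recipes of the selected VPlan, and finishes with fixVectorizedLoop().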
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
                unsigned VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                VPTransformState &State, VPValue *Addr,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr, VPValue *StoredValue,
                                  VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one.
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  void createInductionResumeValues(Loop *L, Value *VectorTripCount);

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM, BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints on how the scalar epilogue loop should
// be lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way, and
  /// the form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This map of decisions is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usage of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed.
/// The flag NeedToScalarize shows if the call needs to be scalarized, i.e.
/// either a vector version isn't available or it is too expensive.
unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

/// Invalidates decisions already taken by the cost model.
void invalidateCostModelingDecisions() {
  WideningDecisions.clear();
  Uniforms.clear();
  Scalars.clear();
}

private:
unsigned NumPredStores = 0;

/// \return An upper bound for the vectorization factor, a power-of-2 larger
/// than zero. One is returned if vectorization should best be avoided due
/// to cost.
unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
/// operate on vector values after type legalization in the backend. If this
/// latter value is false, then all operations will be scalarized (i.e. no
/// vectorization has actually taken place).
using VectorizationCostTy = std::pair<unsigned, bool>;

/// Returns the expected execution cost. The unit of the cost does
/// not matter because we use the 'cost' units to compare different
/// vector widths. The cost that is returned is *not* normalized by
/// the factor width.
VectorizationCostTy expectedCost(unsigned VF);

/// Returns the execution time cost of an instruction for a given vector
/// width. Vector width of one means scalar.
VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

/// The cost-computation logic from getInstructionCost which provides
/// the vector type as an output parameter.
unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

/// Calculate vectorization cost of memory instruction \p I.
unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

/// The cost computation for scalarized memory instruction.
unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

/// The cost computation for interleaving group of memory instructions.
unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);

/// The cost computation for Gather/Scatter instruction.
unsigned getGatherScatterCost(Instruction *I, unsigned VF);

/// The cost computation for widening instruction \p I with consecutive
/// memory access.
unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);

/// The cost calculation for Load/Store instruction \p I with uniform pointer -
/// Load: scalar load + broadcast.
/// Store: scalar store + (loop invariant value stored? 0 : extract of last
/// element)
unsigned getUniformMemOpCost(Instruction *I, unsigned VF);

/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
unsigned getScalarizationOverhead(Instruction *I, unsigned VF);

/// Returns whether the instruction is a load or store and will be emitted
/// as a vector operation.
bool isConsecutiveLoadOrStore(Instruction *I);

/// Returns true if an artificially high cost for emulated masked memrefs
/// should be used.
bool useEmulatedMaskMemRefHack(Instruction *I);

/// Map of scalar integer values to the smallest bitwidth they can be legally
/// represented as. The vector equivalents of these values should be truncated
/// to this type.
MapVector<Instruction *, uint64_t> MinBWs;

/// A type representing the costs for instructions if they were to be
/// scalarized rather than vectorized. The entries are Instruction-Cost
/// pairs.
using ScalarCostsTy = DenseMap<Instruction *, unsigned>;

/// A set containing all BasicBlocks that are known to be present after
/// vectorization as predicated blocks.
SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

/// Records whether it is allowed to have the original scalar loop execute at
/// least once. This may be needed as a fallback loop in case runtime
/// aliasing/dependence checks fail, or to handle the tail/remainder
/// iterations when the trip count is unknown or doesn't divide evenly by the
/// VF, or as a peel-loop to handle gaps in interleave-groups.
/// Under optsize and when the trip count is very small we don't allow any
/// iterations to execute in the scalar loop.
ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

/// All blocks of the loop are to be masked to fold the tail of the scalar
/// iterations.
bool FoldTailByMasking = false;

/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
/// instruction will be scalarized when vectorizing with the associated
/// vectorization factor. The entries are VF-ScalarCostTy pairs.
DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;

/// Holds the instructions known to be uniform after vectorization.
/// The data is collected per VF.
DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;

/// Holds the instructions known to be scalar after vectorization.
/// The data is collected per VF.
DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;

/// Holds the instructions (address computations) that are forced to be
/// scalarized.
DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;

/// Returns the expected difference in cost from scalarizing the expression
/// feeding a predicated instruction \p PredInst. The instructions to
/// scalarize and their scalar costs are collected in \p ScalarCosts. A
/// non-negative return value implies the expression will be scalarized.
/// Currently, only single-use chains are considered for scalarization.
int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                            unsigned VF);

/// Collect the instructions that are uniform after vectorization. An
/// instruction is uniform if we represent it with a single scalar value in
/// the vectorized loop corresponding to each vector iteration. Examples of
/// uniform instructions include pointer operands of consecutive or
/// interleaved memory accesses. Note that although uniformity implies an
/// instruction will be scalar, the reverse is not true. In general, a
/// scalarized instruction will be represented by VF scalar values in the
/// vectorized loop, each corresponding to an iteration of the original
/// scalar loop.
1486 void collectLoopUniforms(unsigned VF); 1487 1488 /// Collect the instructions that are scalar after vectorization. An 1489 /// instruction is scalar if it is known to be uniform or will be scalarized 1490 /// during vectorization. Non-uniform scalarized instructions will be 1491 /// represented by VF values in the vectorized loop, each corresponding to an 1492 /// iteration of the original scalar loop. 1493 void collectLoopScalars(unsigned VF); 1494 1495 /// Keeps cost model vectorization decision and cost for instructions. 1496 /// Right now it is used for memory instructions only. 1497 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, 1498 std::pair<InstWidening, unsigned>>; 1499 1500 DecisionList WideningDecisions; 1501 1502 /// Returns true if \p V is expected to be vectorized and it needs to be 1503 /// extracted. 1504 bool needsExtract(Value *V, unsigned VF) const { 1505 Instruction *I = dyn_cast<Instruction>(V); 1506 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) 1507 return false; 1508 1509 // Assume we can vectorize V (and hence we need extraction) if the 1510 // scalars are not computed yet. This can happen, because it is called 1511 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1512 // the scalars are collected. That should be a safe assumption in most 1513 // cases, because we check if the operands have vectorizable types 1514 // beforehand in LoopVectorizationLegality. 1515 return Scalars.find(VF) == Scalars.end() || 1516 !isScalarAfterVectorization(I, VF); 1517 }; 1518 1519 /// Returns a range containing only operands needing to be extracted. 1520 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1521 unsigned VF) { 1522 return SmallVector<Value *, 4>(make_filter_range( 1523 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1524 } 1525 1526 public: 1527 /// The loop that we evaluate. 1528 Loop *TheLoop; 1529 1530 /// Predicated scalar evolution analysis. 1531 PredicatedScalarEvolution &PSE; 1532 1533 /// Loop Info analysis. 1534 LoopInfo *LI; 1535 1536 /// Vectorization legality. 1537 LoopVectorizationLegality *Legal; 1538 1539 /// Vector target information. 1540 const TargetTransformInfo &TTI; 1541 1542 /// Target Library Info. 1543 const TargetLibraryInfo *TLI; 1544 1545 /// Demanded bits analysis. 1546 DemandedBits *DB; 1547 1548 /// Assumption cache. 1549 AssumptionCache *AC; 1550 1551 /// Interface to emit optimization remarks. 1552 OptimizationRemarkEmitter *ORE; 1553 1554 const Function *TheFunction; 1555 1556 /// Loop Vectorize Hint. 1557 const LoopVectorizeHints *Hints; 1558 1559 /// The interleave access information contains groups of interleaved accesses 1560 /// with the same stride and close to each other. 1561 InterleavedAccessInfo &InterleaveInfo; 1562 1563 /// Values to ignore in the cost model. 1564 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1565 1566 /// Values to ignore in the cost model when VF > 1. 1567 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1568 }; 1569 1570 } // end namespace llvm 1571 1572 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1573 // vectorization. The loop needs to be annotated with #pragma omp simd 1574 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1575 // vector length information is not provided, vectorization is not considered 1576 // explicit. Interleave hints are not allowed either. These limitations will be 1577 // relaxed in the future. 
1578 // Please, note that we are currently forced to abuse the pragma 'clang 1579 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1580 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1581 // provides *explicit vectorization hints* (LV can bypass legal checks and 1582 // assume that vectorization is legal). However, both hints are implemented 1583 // using the same metadata (llvm.loop.vectorize, processed by 1584 // LoopVectorizeHints). This will be fixed in the future when the native IR 1585 // representation for pragma 'omp simd' is introduced. 1586 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1587 OptimizationRemarkEmitter *ORE) { 1588 assert(!OuterLp->empty() && "This is not an outer loop"); 1589 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1590 1591 // Only outer loops with an explicit vectorization hint are supported. 1592 // Unannotated outer loops are ignored. 1593 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1594 return false; 1595 1596 Function *Fn = OuterLp->getHeader()->getParent(); 1597 if (!Hints.allowVectorization(Fn, OuterLp, 1598 true /*VectorizeOnlyWhenForced*/)) { 1599 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1600 return false; 1601 } 1602 1603 if (Hints.getInterleave() > 1) { 1604 // TODO: Interleave support is future work. 1605 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1606 "outer loops.\n"); 1607 Hints.emitRemarkWithHints(); 1608 return false; 1609 } 1610 1611 return true; 1612 } 1613 1614 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1615 OptimizationRemarkEmitter *ORE, 1616 SmallVectorImpl<Loop *> &V) { 1617 // Collect inner loops and outer loops without irreducible control flow. For 1618 // now, only collect outer loops that have explicit vectorization hints. If we 1619 // are stress testing the VPlan H-CFG construction, we collect the outermost 1620 // loop of every loop nest. 1621 if (L.empty() || VPlanBuildStressTest || 1622 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1623 LoopBlocksRPO RPOT(&L); 1624 RPOT.perform(LI); 1625 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1626 V.push_back(&L); 1627 // TODO: Collect inner loops inside marked outer loops in case 1628 // vectorization fails for the outer loop. Do not invoke 1629 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1630 // already known to be reducible. We can use an inherited attribute for 1631 // that. 1632 return; 1633 } 1634 } 1635 for (Loop *InnerL : L) 1636 collectSupportedLoops(*InnerL, LI, ORE, V); 1637 } 1638 1639 namespace { 1640 1641 /// The LoopVectorize Pass. 
1642 struct LoopVectorize : public FunctionPass { 1643 /// Pass identification, replacement for typeid 1644 static char ID; 1645 1646 LoopVectorizePass Impl; 1647 1648 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1649 bool VectorizeOnlyWhenForced = false) 1650 : FunctionPass(ID), 1651 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1652 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1653 } 1654 1655 bool runOnFunction(Function &F) override { 1656 if (skipFunction(F)) 1657 return false; 1658 1659 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1660 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1661 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1662 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1663 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1664 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1665 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1666 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1667 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1668 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1669 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1670 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1671 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1672 1673 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1674 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1675 1676 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1677 GetLAA, *ORE, PSI).MadeAnyChange; 1678 } 1679 1680 void getAnalysisUsage(AnalysisUsage &AU) const override { 1681 AU.addRequired<AssumptionCacheTracker>(); 1682 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1683 AU.addRequired<DominatorTreeWrapperPass>(); 1684 AU.addRequired<LoopInfoWrapperPass>(); 1685 AU.addRequired<ScalarEvolutionWrapperPass>(); 1686 AU.addRequired<TargetTransformInfoWrapperPass>(); 1687 AU.addRequired<AAResultsWrapperPass>(); 1688 AU.addRequired<LoopAccessLegacyAnalysis>(); 1689 AU.addRequired<DemandedBitsWrapperPass>(); 1690 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1691 AU.addRequired<InjectTLIMappingsLegacy>(); 1692 1693 // We currently do not preserve loopinfo/dominator analyses with outer loop 1694 // vectorization. Until this is addressed, mark these analyses as preserved 1695 // only for non-VPlan-native path. 1696 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1697 if (!EnableVPlanNativePath) { 1698 AU.addPreserved<LoopInfoWrapperPass>(); 1699 AU.addPreserved<DominatorTreeWrapperPass>(); 1700 } 1701 1702 AU.addPreserved<BasicAAWrapperPass>(); 1703 AU.addPreserved<GlobalsAAWrapperPass>(); 1704 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1705 } 1706 }; 1707 1708 } // end anonymous namespace 1709 1710 //===----------------------------------------------------------------------===// 1711 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1712 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1713 //===----------------------------------------------------------------------===// 1714 1715 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1716 // We need to place the broadcast of invariant variables outside the loop, 1717 // but only if it's proven safe to do so. Else, broadcast will be inside 1718 // vector loop body. 
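  // As an illustration (values and names are placeholders, not from a
  // particular test): with VF = 4, the splat created below expands to an
  // insertelement into lane 0 followed by a zero-mask shufflevector, e.g.
  //   %b.ins = insertelement <4 x i32> undef, i32 %x, i32 0
  //   %broadcast = shufflevector <4 x i32> %b.ins, <4 x i32> undef,
  //                              <4 x i32> zeroinitializer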
1719 Instruction *Instr = dyn_cast<Instruction>(V); 1720 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1721 (!Instr || 1722 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1723 // Place the code for broadcasting invariant variables in the new preheader. 1724 IRBuilder<>::InsertPointGuard Guard(Builder); 1725 if (SafeToHoist) 1726 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1727 1728 // Broadcast the scalar into all locations in the vector. 1729 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1730 1731 return Shuf; 1732 } 1733 1734 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1735 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1736 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1737 "Expected either an induction phi-node or a truncate of it!"); 1738 Value *Start = II.getStartValue(); 1739 1740 // Construct the initial value of the vector IV in the vector loop preheader 1741 auto CurrIP = Builder.saveIP(); 1742 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1743 if (isa<TruncInst>(EntryVal)) { 1744 assert(Start->getType()->isIntegerTy() && 1745 "Truncation requires an integer type"); 1746 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1747 Step = Builder.CreateTrunc(Step, TruncType); 1748 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1749 } 1750 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1751 Value *SteppedStart = 1752 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1753 1754 // We create vector phi nodes for both integer and floating-point induction 1755 // variables. Here, we determine the kind of arithmetic we will perform. 1756 Instruction::BinaryOps AddOp; 1757 Instruction::BinaryOps MulOp; 1758 if (Step->getType()->isIntegerTy()) { 1759 AddOp = Instruction::Add; 1760 MulOp = Instruction::Mul; 1761 } else { 1762 AddOp = II.getInductionOpcode(); 1763 MulOp = Instruction::FMul; 1764 } 1765 1766 // Multiply the vectorization factor by the step using integer or 1767 // floating-point arithmetic as appropriate. 1768 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); 1769 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1770 1771 // Create a vector splat to use in the induction update. 1772 // 1773 // FIXME: If the step is non-constant, we create the vector splat with 1774 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1775 // handle a constant vector splat. 1776 Value *SplatVF = 1777 isa<Constant>(Mul) 1778 ? ConstantVector::getSplat({VF, false}, cast<Constant>(Mul)) 1779 : Builder.CreateVectorSplat(VF, Mul); 1780 Builder.restoreIP(CurrIP); 1781 1782 // We may need to add the step a number of times, depending on the unroll 1783 // factor. The last of those goes into the PHI. 
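  // For example (illustrative values): with VF = 4, UF = 2, a start of 0 and
  // a step of 1, SteppedStart is <0, 1, 2, 3> and SplatVF is <4, 4, 4, 4>;
  // part 0 uses <0, 1, 2, 3>, part 1 uses <4, 5, 6, 7>, and the value wired
  // back into the PHI below for the next vector iteration is <8, 9, 10, 11>.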
1784 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1785 &*LoopVectorBody->getFirstInsertionPt()); 1786 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1787 Instruction *LastInduction = VecInd; 1788 for (unsigned Part = 0; Part < UF; ++Part) { 1789 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1790 1791 if (isa<TruncInst>(EntryVal)) 1792 addMetadata(LastInduction, EntryVal); 1793 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1794 1795 LastInduction = cast<Instruction>(addFastMathFlag( 1796 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1797 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1798 } 1799 1800 // Move the last step to the end of the latch block. This ensures consistent 1801 // placement of all induction updates. 1802 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1803 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1804 auto *ICmp = cast<Instruction>(Br->getCondition()); 1805 LastInduction->moveBefore(ICmp); 1806 LastInduction->setName("vec.ind.next"); 1807 1808 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1809 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1810 } 1811 1812 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1813 return Cost->isScalarAfterVectorization(I, VF) || 1814 Cost->isProfitableToScalarize(I, VF); 1815 } 1816 1817 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1818 if (shouldScalarizeInstruction(IV)) 1819 return true; 1820 auto isScalarInst = [&](User *U) -> bool { 1821 auto *I = cast<Instruction>(U); 1822 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1823 }; 1824 return llvm::any_of(IV->users(), isScalarInst); 1825 } 1826 1827 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1828 const InductionDescriptor &ID, const Instruction *EntryVal, 1829 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1830 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1831 "Expected either an induction phi-node or a truncate of it!"); 1832 1833 // This induction variable is not the phi from the original loop but the 1834 // newly-created IV based on the proof that casted Phi is equal to the 1835 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1836 // re-uses the same InductionDescriptor that original IV uses but we don't 1837 // have to do any recording in this case - that is done when original IV is 1838 // processed. 1839 if (isa<TruncInst>(EntryVal)) 1840 return; 1841 1842 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1843 if (Casts.empty()) 1844 return; 1845 // Only the first Cast instruction in the Casts vector is of interest. 1846 // The rest of the Casts (if exist) have no uses outside the 1847 // induction update chain itself. 
1848 Instruction *CastInst = *Casts.begin(); 1849 if (Lane < UINT_MAX) 1850 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1851 else 1852 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1853 } 1854 1855 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1856 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1857 "Primary induction variable must have an integer type"); 1858 1859 auto II = Legal->getInductionVars().find(IV); 1860 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 1861 1862 auto ID = II->second; 1863 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1864 1865 // The value from the original loop to which we are mapping the new induction 1866 // variable. 1867 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1868 1869 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1870 1871 // Generate code for the induction step. Note that induction steps are 1872 // required to be loop-invariant 1873 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 1874 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 1875 "Induction step should be loop invariant"); 1876 if (PSE.getSE()->isSCEVable(IV->getType())) { 1877 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1878 return Exp.expandCodeFor(Step, Step->getType(), 1879 LoopVectorPreHeader->getTerminator()); 1880 } 1881 return cast<SCEVUnknown>(Step)->getValue(); 1882 }; 1883 1884 // The scalar value to broadcast. This is derived from the canonical 1885 // induction variable. If a truncation type is given, truncate the canonical 1886 // induction variable and step. Otherwise, derive these values from the 1887 // induction descriptor. 1888 auto CreateScalarIV = [&](Value *&Step) -> Value * { 1889 Value *ScalarIV = Induction; 1890 if (IV != OldInduction) { 1891 ScalarIV = IV->getType()->isIntegerTy() 1892 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1893 : Builder.CreateCast(Instruction::SIToFP, Induction, 1894 IV->getType()); 1895 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1896 ScalarIV->setName("offset.idx"); 1897 } 1898 if (Trunc) { 1899 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1900 assert(Step->getType()->isIntegerTy() && 1901 "Truncation requires an integer step"); 1902 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1903 Step = Builder.CreateTrunc(Step, TruncType); 1904 } 1905 return ScalarIV; 1906 }; 1907 1908 // Create the vector values from the scalar IV, in the absence of creating a 1909 // vector IV. 1910 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 1911 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1912 for (unsigned Part = 0; Part < UF; ++Part) { 1913 Value *EntryPart = 1914 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); 1915 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1916 if (Trunc) 1917 addMetadata(EntryPart, Trunc); 1918 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1919 } 1920 }; 1921 1922 // Now do the actual transformations, and start with creating the step value. 1923 Value *Step = CreateStepValue(ID.getStep()); 1924 if (VF <= 1) { 1925 Value *ScalarIV = CreateScalarIV(Step); 1926 CreateSplatIV(ScalarIV, Step); 1927 return; 1928 } 1929 1930 // Determine if we want a scalar version of the induction variable. 
This is 1931 // true if the induction variable itself is not widened, or if it has at 1932 // least one user in the loop that is not widened. 1933 auto NeedsScalarIV = needsScalarInduction(EntryVal); 1934 if (!NeedsScalarIV) { 1935 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1936 return; 1937 } 1938 1939 // Try to create a new independent vector induction variable. If we can't 1940 // create the phi node, we will splat the scalar induction variable in each 1941 // loop iteration. 1942 if (!shouldScalarizeInstruction(EntryVal)) { 1943 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1944 Value *ScalarIV = CreateScalarIV(Step); 1945 // Create scalar steps that can be used by instructions we will later 1946 // scalarize. Note that the addition of the scalar steps will not increase 1947 // the number of instructions in the loop in the common case prior to 1948 // InstCombine. We will be trading one vector extract for each scalar step. 1949 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1950 return; 1951 } 1952 1953 // All IV users are scalar instructions, so only emit a scalar IV, not a 1954 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 1955 // predicate used by the masked loads/stores. 1956 Value *ScalarIV = CreateScalarIV(Step); 1957 if (!Cost->isScalarEpilogueAllowed()) 1958 CreateSplatIV(ScalarIV, Step); 1959 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1960 } 1961 1962 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 1963 Instruction::BinaryOps BinOp) { 1964 // Create and check the types. 1965 auto *ValVTy = cast<VectorType>(Val->getType()); 1966 int VLen = ValVTy->getNumElements(); 1967 1968 Type *STy = Val->getType()->getScalarType(); 1969 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1970 "Induction Step must be an integer or FP"); 1971 assert(Step->getType() == STy && "Step has wrong type"); 1972 1973 SmallVector<Constant *, 8> Indices; 1974 1975 if (STy->isIntegerTy()) { 1976 // Create a vector of consecutive numbers from zero to VF. 1977 for (int i = 0; i < VLen; ++i) 1978 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 1979 1980 // Add the consecutive indices to the vector value. 1981 Constant *Cv = ConstantVector::get(Indices); 1982 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 1983 Step = Builder.CreateVectorSplat(VLen, Step); 1984 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1985 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 1986 // which can be found from the original scalar operations. 1987 Step = Builder.CreateMul(Cv, Step); 1988 return Builder.CreateAdd(Val, Step, "induction"); 1989 } 1990 1991 // Floating point induction. 1992 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1993 "Binary Opcode should be specified for FP induction"); 1994 // Create a vector of consecutive numbers from zero to VF. 1995 for (int i = 0; i < VLen; ++i) 1996 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 1997 1998 // Add the consecutive indices to the vector value. 1999 Constant *Cv = ConstantVector::get(Indices); 2000 2001 Step = Builder.CreateVectorSplat(VLen, Step); 2002 2003 // Floating point operations had to be 'fast' to enable the induction. 
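  // For example (illustrative): with VF = 4, StartIdx = 0 and step %s, lane i
  // of the result below is Val[i] <op> (0 + i) * %s, where <op> is the fadd
  // or fsub requested by BinOp; for fadd and a Val splatted from %v this is
  // <%v, %v + %s, %v + 2 * %s, %v + 3 * %s>. The integer path above is
  // analogous, using mul and add.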
2004 FastMathFlags Flags; 2005 Flags.setFast(); 2006 2007 Value *MulOp = Builder.CreateFMul(Cv, Step); 2008 if (isa<Instruction>(MulOp)) 2009 // Have to check, MulOp may be a constant 2010 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 2011 2012 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2013 if (isa<Instruction>(BOp)) 2014 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2015 return BOp; 2016 } 2017 2018 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2019 Instruction *EntryVal, 2020 const InductionDescriptor &ID) { 2021 // We shouldn't have to build scalar steps if we aren't vectorizing. 2022 assert(VF > 1 && "VF should be greater than one"); 2023 2024 // Get the value type and ensure it and the step have the same integer type. 2025 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2026 assert(ScalarIVTy == Step->getType() && 2027 "Val and Step should have the same type"); 2028 2029 // We build scalar steps for both integer and floating-point induction 2030 // variables. Here, we determine the kind of arithmetic we will perform. 2031 Instruction::BinaryOps AddOp; 2032 Instruction::BinaryOps MulOp; 2033 if (ScalarIVTy->isIntegerTy()) { 2034 AddOp = Instruction::Add; 2035 MulOp = Instruction::Mul; 2036 } else { 2037 AddOp = ID.getInductionOpcode(); 2038 MulOp = Instruction::FMul; 2039 } 2040 2041 // Determine the number of scalars we need to generate for each unroll 2042 // iteration. If EntryVal is uniform, we only need to generate the first 2043 // lane. Otherwise, we generate all VF values. 2044 unsigned Lanes = 2045 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 2046 : VF; 2047 // Compute the scalar steps and save the results in VectorLoopValueMap. 2048 for (unsigned Part = 0; Part < UF; ++Part) { 2049 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2050 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); 2051 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2052 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2053 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2054 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2055 } 2056 } 2057 } 2058 2059 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2060 assert(V != Induction && "The new induction variable should not be used."); 2061 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2062 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2063 2064 // If we have a stride that is replaced by one, do it here. Defer this for 2065 // the VPlan-native path until we start running Legal checks in that path. 2066 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2067 V = ConstantInt::get(V->getType(), 1); 2068 2069 // If we have a vector mapped to this value, return it. 2070 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2071 return VectorLoopValueMap.getVectorValue(V, Part); 2072 2073 // If the value has not been vectorized, check if it has been scalarized 2074 // instead. If it has been scalarized, and we actually need the value in 2075 // vector form, we will construct the vector values on demand. 2076 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2077 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2078 2079 // If we've scalarized a value, that value should be an instruction. 
2080 auto *I = cast<Instruction>(V); 2081 2082 // If we aren't vectorizing, we can just copy the scalar map values over to 2083 // the vector map. 2084 if (VF == 1) { 2085 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2086 return ScalarValue; 2087 } 2088 2089 // Get the last scalar instruction we generated for V and Part. If the value 2090 // is known to be uniform after vectorization, this corresponds to lane zero 2091 // of the Part unroll iteration. Otherwise, the last instruction is the one 2092 // we created for the last vector lane of the Part unroll iteration. 2093 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; 2094 auto *LastInst = cast<Instruction>( 2095 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2096 2097 // Set the insert point after the last scalarized instruction. This ensures 2098 // the insertelement sequence will directly follow the scalar definitions. 2099 auto OldIP = Builder.saveIP(); 2100 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2101 Builder.SetInsertPoint(&*NewIP); 2102 2103 // However, if we are vectorizing, we need to construct the vector values. 2104 // If the value is known to be uniform after vectorization, we can just 2105 // broadcast the scalar value corresponding to lane zero for each unroll 2106 // iteration. Otherwise, we construct the vector values using insertelement 2107 // instructions. Since the resulting vectors are stored in 2108 // VectorLoopValueMap, we will only generate the insertelements once. 2109 Value *VectorValue = nullptr; 2110 if (Cost->isUniformAfterVectorization(I, VF)) { 2111 VectorValue = getBroadcastInstrs(ScalarValue); 2112 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2113 } else { 2114 // Initialize packing with insertelements to start from undef. 2115 Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), VF)); 2116 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2117 for (unsigned Lane = 0; Lane < VF; ++Lane) 2118 packScalarIntoVectorValue(V, {Part, Lane}); 2119 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2120 } 2121 Builder.restoreIP(OldIP); 2122 return VectorValue; 2123 } 2124 2125 // If this scalar is unknown, assume that it is a constant or that it is 2126 // loop invariant. Broadcast V and save the value for future uses. 2127 Value *B = getBroadcastInstrs(V); 2128 VectorLoopValueMap.setVectorValue(V, Part, B); 2129 return B; 2130 } 2131 2132 Value * 2133 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2134 const VPIteration &Instance) { 2135 // If the value is not an instruction contained in the loop, it should 2136 // already be scalar. 2137 if (OrigLoop->isLoopInvariant(V)) 2138 return V; 2139 2140 assert(Instance.Lane > 0 2141 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2142 : true && "Uniform values only have lane zero"); 2143 2144 // If the value from the original loop has not been vectorized, it is 2145 // represented by UF x VF scalar values in the new loop. Return the requested 2146 // scalar value. 2147 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2148 return VectorLoopValueMap.getScalarValue(V, Instance); 2149 2150 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2151 // for the given unroll part. If this entry is not a vector type (i.e., the 2152 // vectorization factor is one), there is no need to generate an 2153 // extractelement instruction. 
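  // For example (illustrative names): with VF = 4, requesting {Part = 1,
  // Lane = 2} of a value that was widened into %v.0 and %v.1 yields
  //   %s = extractelement <4 x i32> %v.1, i32 2
  // whereas with VF = 1 the per-part value is already scalar and is returned
  // as-is.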
2154 auto *U = getOrCreateVectorValue(V, Instance.Part); 2155 if (!U->getType()->isVectorTy()) { 2156 assert(VF == 1 && "Value not scalarized has non-vector type"); 2157 return U; 2158 } 2159 2160 // Otherwise, the value from the original loop has been vectorized and is 2161 // represented by UF vector values. Extract and return the requested scalar 2162 // value from the appropriate vector lane. 2163 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2164 } 2165 2166 void InnerLoopVectorizer::packScalarIntoVectorValue( 2167 Value *V, const VPIteration &Instance) { 2168 assert(V != Induction && "The new induction variable should not be used."); 2169 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2170 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2171 2172 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2173 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2174 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2175 Builder.getInt32(Instance.Lane)); 2176 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2177 } 2178 2179 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2180 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2181 SmallVector<int, 8> ShuffleMask; 2182 for (unsigned i = 0; i < VF; ++i) 2183 ShuffleMask.push_back(VF - i - 1); 2184 2185 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2186 ShuffleMask, "reverse"); 2187 } 2188 2189 // Return whether we allow using masked interleave-groups (for dealing with 2190 // strided loads/stores that reside in predicated blocks, or for dealing 2191 // with gaps). 2192 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2193 // If an override option has been passed in for interleaved accesses, use it. 2194 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2195 return EnableMaskedInterleavedMemAccesses; 2196 2197 return TTI.enableMaskedInterleavedAccessVectorization(); 2198 } 2199 2200 // Try to vectorize the interleave group that \p Instr belongs to. 2201 // 2202 // E.g. Translate following interleaved load group (factor = 3): 2203 // for (i = 0; i < N; i+=3) { 2204 // R = Pic[i]; // Member of index 0 2205 // G = Pic[i+1]; // Member of index 1 2206 // B = Pic[i+2]; // Member of index 2 2207 // ... // do something to R, G, B 2208 // } 2209 // To: 2210 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2211 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2212 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2213 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2214 // 2215 // Or translate following interleaved store group (factor = 3): 2216 // for (i = 0; i < N; i+=3) { 2217 // ... 
do something to R, G, B 2218 // Pic[i] = R; // Member of index 0 2219 // Pic[i+1] = G; // Member of index 1 2220 // Pic[i+2] = B; // Member of index 2 2221 // } 2222 // To: 2223 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2224 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2225 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2226 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2227 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2228 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2229 const InterleaveGroup<Instruction> *Group, VPTransformState &State, 2230 VPValue *Addr, VPValue *BlockInMask) { 2231 Instruction *Instr = Group->getInsertPos(); 2232 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2233 2234 // Prepare for the vector type of the interleaved load/store. 2235 Type *ScalarTy = getMemInstValueType(Instr); 2236 unsigned InterleaveFactor = Group->getFactor(); 2237 auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF); 2238 2239 // Prepare for the new pointers. 2240 SmallVector<Value *, 2> AddrParts; 2241 unsigned Index = Group->getIndex(Instr); 2242 2243 // TODO: extend the masked interleaved-group support to reversed access. 2244 assert((!BlockInMask || !Group->isReverse()) && 2245 "Reversed masked interleave-group not supported."); 2246 2247 // If the group is reverse, adjust the index to refer to the last vector lane 2248 // instead of the first. We adjust the index from the first vector lane, 2249 // rather than directly getting the pointer for lane VF - 1, because the 2250 // pointer operand of the interleaved access is supposed to be uniform. For 2251 // uniform instructions, we're only required to generate a value for the 2252 // first vector lane in each unroll iteration. 2253 if (Group->isReverse()) 2254 Index += (VF - 1) * Group->getFactor(); 2255 2256 for (unsigned Part = 0; Part < UF; Part++) { 2257 Value *AddrPart = State.get(Addr, {Part, 0}); 2258 setDebugLocFromInst(Builder, AddrPart); 2259 2260 // Notice current instruction could be any index. Need to adjust the address 2261 // to the member of index 0. 2262 // 2263 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2264 // b = A[i]; // Member of index 0 2265 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2266 // 2267 // E.g. A[i+1] = a; // Member of index 1 2268 // A[i] = b; // Member of index 0 2269 // A[i+2] = c; // Member of index 2 (Current instruction) 2270 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2271 2272 bool InBounds = false; 2273 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2274 InBounds = gep->isInBounds(); 2275 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2276 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2277 2278 // Cast to the vector pointer type. 2279 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2280 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2281 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2282 } 2283 2284 setDebugLocFromInst(Builder, Instr); 2285 Value *UndefVec = UndefValue::get(VecTy); 2286 2287 Value *MaskForGaps = nullptr; 2288 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2289 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); 2290 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2291 } 2292 2293 // Vectorize the interleaved load group. 
2294 if (isa<LoadInst>(Instr)) { 2295 // For each unroll part, create a wide load for the group. 2296 SmallVector<Value *, 2> NewLoads; 2297 for (unsigned Part = 0; Part < UF; Part++) { 2298 Instruction *NewLoad; 2299 if (BlockInMask || MaskForGaps) { 2300 assert(useMaskedInterleavedAccesses(*TTI) && 2301 "masked interleaved groups are not allowed."); 2302 Value *GroupMask = MaskForGaps; 2303 if (BlockInMask) { 2304 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2305 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2306 Value *ShuffledMask = Builder.CreateShuffleVector( 2307 BlockInMaskPart, Undefs, 2308 createReplicatedMask(InterleaveFactor, VF), "interleaved.mask"); 2309 GroupMask = MaskForGaps 2310 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2311 MaskForGaps) 2312 : ShuffledMask; 2313 } 2314 NewLoad = 2315 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2316 GroupMask, UndefVec, "wide.masked.vec"); 2317 } 2318 else 2319 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2320 Group->getAlign(), "wide.vec"); 2321 Group->addMetadata(NewLoad); 2322 NewLoads.push_back(NewLoad); 2323 } 2324 2325 // For each member in the group, shuffle out the appropriate data from the 2326 // wide loads. 2327 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2328 Instruction *Member = Group->getMember(I); 2329 2330 // Skip the gaps in the group. 2331 if (!Member) 2332 continue; 2333 2334 auto StrideMask = createStrideMask(I, InterleaveFactor, VF); 2335 for (unsigned Part = 0; Part < UF; Part++) { 2336 Value *StridedVec = Builder.CreateShuffleVector( 2337 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2338 2339 // If this member has different type, cast the result type. 2340 if (Member->getType() != ScalarTy) { 2341 VectorType *OtherVTy = FixedVectorType::get(Member->getType(), VF); 2342 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2343 } 2344 2345 if (Group->isReverse()) 2346 StridedVec = reverseVector(StridedVec); 2347 2348 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2349 } 2350 } 2351 return; 2352 } 2353 2354 // The sub vector type for current instruction. 2355 auto *SubVT = FixedVectorType::get(ScalarTy, VF); 2356 2357 // Vectorize the interleaved store group. 2358 for (unsigned Part = 0; Part < UF; Part++) { 2359 // Collect the stored vector from each member. 2360 SmallVector<Value *, 4> StoredVecs; 2361 for (unsigned i = 0; i < InterleaveFactor; i++) { 2362 // Interleaved store group doesn't allow a gap, so each index has a member 2363 Instruction *Member = Group->getMember(i); 2364 assert(Member && "Fail to get a member from an interleaved store group"); 2365 2366 Value *StoredVec = getOrCreateVectorValue( 2367 cast<StoreInst>(Member)->getValueOperand(), Part); 2368 if (Group->isReverse()) 2369 StoredVec = reverseVector(StoredVec); 2370 2371 // If this member has different type, cast it to a unified type. 2372 2373 if (StoredVec->getType() != SubVT) 2374 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2375 2376 StoredVecs.push_back(StoredVec); 2377 } 2378 2379 // Concatenate all vectors into a wide vector. 2380 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2381 2382 // Interleave the elements in the wide vector. 
2383 Value *IVec = Builder.CreateShuffleVector( 2384 WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor), 2385 "interleaved.vec"); 2386 2387 Instruction *NewStoreInstr; 2388 if (BlockInMask) { 2389 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2390 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2391 Value *ShuffledMask = Builder.CreateShuffleVector( 2392 BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF), 2393 "interleaved.mask"); 2394 NewStoreInstr = Builder.CreateMaskedStore( 2395 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2396 } 2397 else 2398 NewStoreInstr = 2399 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2400 2401 Group->addMetadata(NewStoreInstr); 2402 } 2403 } 2404 2405 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2406 VPTransformState &State, 2407 VPValue *Addr, 2408 VPValue *StoredValue, 2409 VPValue *BlockInMask) { 2410 // Attempt to issue a wide load. 2411 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2412 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2413 2414 assert((LI || SI) && "Invalid Load/Store instruction"); 2415 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2416 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2417 2418 LoopVectorizationCostModel::InstWidening Decision = 2419 Cost->getWideningDecision(Instr, VF); 2420 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2421 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2422 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2423 "CM decision is not to widen the memory instruction"); 2424 2425 Type *ScalarDataTy = getMemInstValueType(Instr); 2426 auto *DataTy = FixedVectorType::get(ScalarDataTy, VF); 2427 const Align Alignment = getLoadStoreAlignment(Instr); 2428 2429 // Determine if the pointer operand of the access is either consecutive or 2430 // reverse consecutive. 2431 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2432 bool ConsecutiveStride = 2433 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2434 bool CreateGatherScatter = 2435 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2436 2437 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2438 // gather/scatter. Otherwise Decision should have been to Scalarize. 2439 assert((ConsecutiveStride || CreateGatherScatter) && 2440 "The instruction should be scalarized"); 2441 (void)ConsecutiveStride; 2442 2443 VectorParts BlockInMaskParts(UF); 2444 bool isMaskRequired = BlockInMask; 2445 if (isMaskRequired) 2446 for (unsigned Part = 0; Part < UF; ++Part) 2447 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2448 2449 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2450 // Calculate the pointer for the specific unroll-part. 2451 GetElementPtrInst *PartPtr = nullptr; 2452 2453 bool InBounds = false; 2454 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2455 InBounds = gep->isInBounds(); 2456 2457 if (Reverse) { 2458 // If the address is consecutive but reversed, then the 2459 // wide store needs to start at the last vector element. 
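      // For example (illustrative): with VF = 4 and Part = 1, the two GEPs
      // below offset the pointer by -1 * 4 and then by 1 - 4 = -3 elements,
      // so the wide access covers the addresses of scalar elements
      // i - 7 .. i - 4; the loaded or stored vector is then reversed to match
      // the original iteration order.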
2460 PartPtr = cast<GetElementPtrInst>( 2461 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); 2462 PartPtr->setIsInBounds(InBounds); 2463 PartPtr = cast<GetElementPtrInst>( 2464 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); 2465 PartPtr->setIsInBounds(InBounds); 2466 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2467 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2468 } else { 2469 PartPtr = cast<GetElementPtrInst>( 2470 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); 2471 PartPtr->setIsInBounds(InBounds); 2472 } 2473 2474 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2475 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2476 }; 2477 2478 // Handle Stores: 2479 if (SI) { 2480 setDebugLocFromInst(Builder, SI); 2481 2482 for (unsigned Part = 0; Part < UF; ++Part) { 2483 Instruction *NewSI = nullptr; 2484 Value *StoredVal = State.get(StoredValue, Part); 2485 if (CreateGatherScatter) { 2486 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2487 Value *VectorGep = State.get(Addr, Part); 2488 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2489 MaskPart); 2490 } else { 2491 if (Reverse) { 2492 // If we store to reverse consecutive memory locations, then we need 2493 // to reverse the order of elements in the stored value. 2494 StoredVal = reverseVector(StoredVal); 2495 // We don't want to update the value in the map as it might be used in 2496 // another expression. So don't call resetVectorValue(StoredVal). 2497 } 2498 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2499 if (isMaskRequired) 2500 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2501 BlockInMaskParts[Part]); 2502 else 2503 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2504 } 2505 addMetadata(NewSI, SI); 2506 } 2507 return; 2508 } 2509 2510 // Handle loads. 2511 assert(LI && "Must have a load instruction"); 2512 setDebugLocFromInst(Builder, LI); 2513 for (unsigned Part = 0; Part < UF; ++Part) { 2514 Value *NewLI; 2515 if (CreateGatherScatter) { 2516 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2517 Value *VectorGep = State.get(Addr, Part); 2518 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2519 nullptr, "wide.masked.gather"); 2520 addMetadata(NewLI, LI); 2521 } else { 2522 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2523 if (isMaskRequired) 2524 NewLI = Builder.CreateMaskedLoad( 2525 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2526 "wide.masked.load"); 2527 else 2528 NewLI = 2529 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2530 2531 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2532 addMetadata(NewLI, LI); 2533 if (Reverse) 2534 NewLI = reverseVector(NewLI); 2535 } 2536 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2537 } 2538 } 2539 2540 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2541 const VPIteration &Instance, 2542 bool IfPredicateInstr, 2543 VPTransformState &State) { 2544 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2545 2546 setDebugLocFromInst(Builder, Instr); 2547 2548 // Does this instruction return a value ? 
2549 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2550 2551 Instruction *Cloned = Instr->clone(); 2552 if (!IsVoidRetTy) 2553 Cloned->setName(Instr->getName() + ".cloned"); 2554 2555 // Replace the operands of the cloned instructions with their scalar 2556 // equivalents in the new loop. 2557 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2558 auto *NewOp = State.get(User.getOperand(op), Instance); 2559 Cloned->setOperand(op, NewOp); 2560 } 2561 addNewMetadata(Cloned, Instr); 2562 2563 // Place the cloned scalar in the new loop. 2564 Builder.Insert(Cloned); 2565 2566 // Add the cloned scalar to the scalar map entry. 2567 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2568 2569 // If we just cloned a new assumption, add it the assumption cache. 2570 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2571 if (II->getIntrinsicID() == Intrinsic::assume) 2572 AC->registerAssumption(II); 2573 2574 // End if-block. 2575 if (IfPredicateInstr) 2576 PredicatedInstructions.push_back(Cloned); 2577 } 2578 2579 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2580 Value *End, Value *Step, 2581 Instruction *DL) { 2582 BasicBlock *Header = L->getHeader(); 2583 BasicBlock *Latch = L->getLoopLatch(); 2584 // As we're just creating this loop, it's possible no latch exists 2585 // yet. If so, use the header as this will be a single block loop. 2586 if (!Latch) 2587 Latch = Header; 2588 2589 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2590 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2591 setDebugLocFromInst(Builder, OldInst); 2592 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2593 2594 Builder.SetInsertPoint(Latch->getTerminator()); 2595 setDebugLocFromInst(Builder, OldInst); 2596 2597 // Create i+1 and fill the PHINode. 2598 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2599 Induction->addIncoming(Start, L->getLoopPreheader()); 2600 Induction->addIncoming(Next, Latch); 2601 // Create the compare. 2602 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2603 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2604 2605 // Now we have two terminators. Remove the old one from the block. 2606 Latch->getTerminator()->eraseFromParent(); 2607 2608 return Induction; 2609 } 2610 2611 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2612 if (TripCount) 2613 return TripCount; 2614 2615 assert(L && "Create Trip Count for null loop."); 2616 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2617 // Find the loop boundaries. 2618 ScalarEvolution *SE = PSE.getSE(); 2619 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2620 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2621 "Invalid loop count"); 2622 2623 Type *IdxTy = Legal->getWidestInductionType(); 2624 assert(IdxTy && "No type for induction"); 2625 2626 // The exit count might have the type of i64 while the phi is i32. This can 2627 // happen if we have an induction variable that is sign extended before the 2628 // compare. The only way that we get a backedge taken count is that the 2629 // induction variable was signed and as such will not overflow. In such a case 2630 // truncation is legal. 
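  // For example (illustrative): an i32 IV that is sign-extended for the exit
  // compare can produce an i64 backedge-taken count of 99; truncating it to
  // the widest induction type i32 still yields 99, and adding one below gives
  // the trip count 100.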
2631 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2632 IdxTy->getPrimitiveSizeInBits()) 2633 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2634 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2635 2636 // Get the total trip count from the count by adding 1. 2637 const SCEV *ExitCount = SE->getAddExpr( 2638 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2639 2640 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2641 2642 // Expand the trip count and place the new instructions in the preheader. 2643 // Notice that the pre-header does not change, only the loop body. 2644 SCEVExpander Exp(*SE, DL, "induction"); 2645 2646 // Count holds the overall loop count (N). 2647 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2648 L->getLoopPreheader()->getTerminator()); 2649 2650 if (TripCount->getType()->isPointerTy()) 2651 TripCount = 2652 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2653 L->getLoopPreheader()->getTerminator()); 2654 2655 return TripCount; 2656 } 2657 2658 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2659 if (VectorTripCount) 2660 return VectorTripCount; 2661 2662 Value *TC = getOrCreateTripCount(L); 2663 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2664 2665 Type *Ty = TC->getType(); 2666 Constant *Step = ConstantInt::get(Ty, VF * UF); 2667 2668 // If the tail is to be folded by masking, round the number of iterations N 2669 // up to a multiple of Step instead of rounding down. This is done by first 2670 // adding Step-1 and then rounding down. Note that it's ok if this addition 2671 // overflows: the vector induction variable will eventually wrap to zero given 2672 // that it starts at zero and its Step is a power of two; the loop will then 2673 // exit, with the last early-exit vector comparison also producing all-true. 2674 if (Cost->foldTailByMasking()) { 2675 assert(isPowerOf2_32(VF * UF) && 2676 "VF*UF must be a power of 2 when folding tail by masking"); 2677 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); 2678 } 2679 2680 // Now we need to generate the expression for the part of the loop that the 2681 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2682 // iterations are not required for correctness, or N - Step, otherwise. Step 2683 // is equal to the vectorization factor (number of SIMD elements) times the 2684 // unroll factor (number of SIMD instructions). 2685 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2686 2687 // If there is a non-reversed interleaved group that may speculatively access 2688 // memory out-of-bounds, we need to ensure that there will be at least one 2689 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2690 // the trip count, we set the remainder to be equal to the step. If the step 2691 // does not evenly divide the trip count, no adjustment is necessary since 2692 // there will already be scalar iterations. Note that the minimum iterations 2693 // check ensures that N >= Step. 
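  // For example (illustrative): with VF = 4 and UF = 2 (Step = 8), N = 21
  // gives n.mod.vf = 5 and n.vec = 16, leaving 5 scalar iterations; with
  // N = 24 and a required scalar epilogue, the remainder below is bumped from
  // 0 to 8 so that n.vec = 16 and the last 8 iterations run in the scalar
  // loop. When folding the tail by masking, N was instead rounded up to a
  // multiple of 8 above.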
2694 if (VF > 1 && Cost->requiresScalarEpilogue()) { 2695 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2696 R = Builder.CreateSelect(IsZero, Step, R); 2697 } 2698 2699 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2700 2701 return VectorTripCount; 2702 } 2703 2704 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2705 const DataLayout &DL) { 2706 // Verify that V is a vector type with same number of elements as DstVTy. 2707 unsigned VF = DstVTy->getNumElements(); 2708 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2709 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2710 Type *SrcElemTy = SrcVecTy->getElementType(); 2711 Type *DstElemTy = DstVTy->getElementType(); 2712 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2713 "Vector elements must have same size"); 2714 2715 // Do a direct cast if element types are castable. 2716 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2717 return Builder.CreateBitOrPointerCast(V, DstVTy); 2718 } 2719 // V cannot be directly casted to desired vector type. 2720 // May happen when V is a floating point vector but DstVTy is a vector of 2721 // pointers or vice-versa. Handle this using a two-step bitcast using an 2722 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2723 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2724 "Only one type should be a pointer type"); 2725 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2726 "Only one type should be a floating point type"); 2727 Type *IntTy = 2728 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2729 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2730 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2731 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2732 } 2733 2734 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2735 BasicBlock *Bypass) { 2736 Value *Count = getOrCreateTripCount(L); 2737 // Reuse existing vector loop preheader for TC checks. 2738 // Note that new preheader block is generated for vector loop. 2739 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2740 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2741 2742 // Generate code to check if the loop's trip count is less than VF * UF, or 2743 // equal to it in case a scalar epilogue is required; this implies that the 2744 // vector trip count is zero. This check also covers the case where adding one 2745 // to the backedge-taken count overflowed leading to an incorrect trip count 2746 // of zero. In this case we will also jump to the scalar loop. 2747 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 2748 : ICmpInst::ICMP_ULT; 2749 2750 // If tail is to be folded, vector loop takes care of all iterations. 2751 Value *CheckMinIters = Builder.getFalse(); 2752 if (!Cost->foldTailByMasking()) 2753 CheckMinIters = Builder.CreateICmp( 2754 P, Count, ConstantInt::get(Count->getType(), VF * UF), 2755 "min.iters.check"); 2756 2757 // Create new preheader for vector loop. 2758 LoopVectorPreHeader = 2759 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2760 "vector.ph"); 2761 2762 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2763 DT->getNode(Bypass)->getIDom()) && 2764 "TC check is expected to dominate Bypass"); 2765 2766 // Update dominator for Bypass & LoopExit. 
  DT->changeImmediateDominator(Bypass, TCCheckBlock);
  DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);

  ReplaceInstWithInst(
      TCCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
  LoopBypassBlocks.push_back(TCCheckBlock);
}

void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
  // Reuse existing vector loop preheader for SCEV checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;

  // Generate the code to check the SCEV assumptions that we made.
  // We want the new basic block to start at the first instruction in a
  // sequence of instructions that form a check.
  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
                   "scev.check");
  Value *SCEVCheck = Exp.expandCodeForPredicate(
      &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());

  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
    if (C->isZero())
      return;

  assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
           OptForSizeBasedOnProfile) &&
         "Cannot SCEV check stride or overflow when optimizing for size");

  SCEVCheckBlock->setName("vector.scevcheck");
  // Create new preheader for vector loop.
  LoopVectorPreHeader =
      SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
                 nullptr, "vector.ph");

  // Update dominator only if this is the first RT check.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
    DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
  }

  ReplaceInstWithInst(
      SCEVCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
  LoopBypassBlocks.push_back(SCEVCheckBlock);
  AddedSafetyChecks = true;
}

void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
  // VPlan-native path does not do any analysis for runtime checks currently.
  if (EnableVPlanNativePath)
    return;

  // Reuse existing vector loop preheader for runtime memory checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const MemCheckBlock = L->getLoopPreheader();

  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
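  // Conceptually (a hedged sketch, not the exact IR the expander produces),
  // a pairwise check for two accessed ranges [A, A+N) and [B, B+N) is
  //   (A + N <= B) || (B + N <= A)
  // and a failing check branches to the scalar loop through the bypass block.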
2828 auto *LAI = Legal->getLAI(); 2829 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 2830 if (!RtPtrChecking.Need) 2831 return; 2832 Instruction *FirstCheckInst; 2833 Instruction *MemRuntimeCheck; 2834 std::tie(FirstCheckInst, MemRuntimeCheck) = 2835 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 2836 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 2837 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " 2838 "claimed checks are required"); 2839 2840 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 2841 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2842 "Cannot emit memory checks when optimizing for size, unless forced " 2843 "to vectorize."); 2844 ORE->emit([&]() { 2845 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2846 L->getStartLoc(), L->getHeader()) 2847 << "Code-size may be reduced by not forcing " 2848 "vectorization, or by source-code modifications " 2849 "eliminating the need for runtime checks " 2850 "(e.g., adding 'restrict')."; 2851 }); 2852 } 2853 2854 MemCheckBlock->setName("vector.memcheck"); 2855 // Create new preheader for vector loop. 2856 LoopVectorPreHeader = 2857 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2858 "vector.ph"); 2859 2860 // Update dominator only if this is first RT check. 2861 if (LoopBypassBlocks.empty()) { 2862 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2863 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2864 } 2865 2866 ReplaceInstWithInst( 2867 MemCheckBlock->getTerminator(), 2868 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); 2869 LoopBypassBlocks.push_back(MemCheckBlock); 2870 AddedSafetyChecks = true; 2871 2872 // We currently don't use LoopVersioning for the actual loop cloning but we 2873 // still use it to add the noalias metadata. 2874 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2875 PSE.getSE()); 2876 LVer->prepareNoAliasMetadata(); 2877 } 2878 2879 Value *InnerLoopVectorizer::emitTransformedIndex( 2880 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2881 const InductionDescriptor &ID) const { 2882 2883 SCEVExpander Exp(*SE, DL, "induction"); 2884 auto Step = ID.getStep(); 2885 auto StartValue = ID.getStartValue(); 2886 assert(Index->getType() == Step->getType() && 2887 "Index type does not match StepValue type"); 2888 2889 // Note: the IR at this point is broken. We cannot use SE to create any new 2890 // SCEV and then expand it, hoping that SCEV's simplification will give us 2891 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2892 // lead to various SCEV crashes. So all we can do is to use builder and rely 2893 // on InstCombine for future simplifications. Here we handle some trivial 2894 // cases only. 
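  // The trivial cases handled by the helpers below are folds of the form
  // X + 0 -> X and X * 1 -> X; anything more involved is left to InstCombine.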
2895 auto CreateAdd = [&B](Value *X, Value *Y) { 2896 assert(X->getType() == Y->getType() && "Types don't match!"); 2897 if (auto *CX = dyn_cast<ConstantInt>(X)) 2898 if (CX->isZero()) 2899 return Y; 2900 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2901 if (CY->isZero()) 2902 return X; 2903 return B.CreateAdd(X, Y); 2904 }; 2905 2906 auto CreateMul = [&B](Value *X, Value *Y) { 2907 assert(X->getType() == Y->getType() && "Types don't match!"); 2908 if (auto *CX = dyn_cast<ConstantInt>(X)) 2909 if (CX->isOne()) 2910 return Y; 2911 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2912 if (CY->isOne()) 2913 return X; 2914 return B.CreateMul(X, Y); 2915 }; 2916 2917 // Get a suitable insert point for SCEV expansion. For blocks in the vector 2918 // loop, choose the end of the vector loop header (=LoopVectorBody), because 2919 // the DomTree is not kept up-to-date for additional blocks generated in the 2920 // vector loop. By using the header as insertion point, we guarantee that the 2921 // expanded instructions dominate all their uses. 2922 auto GetInsertPoint = [this, &B]() { 2923 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 2924 if (InsertBB != LoopVectorBody && 2925 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 2926 return LoopVectorBody->getTerminator(); 2927 return &*B.GetInsertPoint(); 2928 }; 2929 switch (ID.getKind()) { 2930 case InductionDescriptor::IK_IntInduction: { 2931 assert(Index->getType() == StartValue->getType() && 2932 "Index type does not match StartValue type"); 2933 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 2934 return B.CreateSub(StartValue, Index); 2935 auto *Offset = CreateMul( 2936 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 2937 return CreateAdd(StartValue, Offset); 2938 } 2939 case InductionDescriptor::IK_PtrInduction: { 2940 assert(isa<SCEVConstant>(Step) && 2941 "Expected constant step for pointer induction"); 2942 return B.CreateGEP( 2943 StartValue->getType()->getPointerElementType(), StartValue, 2944 CreateMul(Index, 2945 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 2946 } 2947 case InductionDescriptor::IK_FpInduction: { 2948 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2949 auto InductionBinOp = ID.getInductionBinOp(); 2950 assert(InductionBinOp && 2951 (InductionBinOp->getOpcode() == Instruction::FAdd || 2952 InductionBinOp->getOpcode() == Instruction::FSub) && 2953 "Original bin op should be defined for FP induction"); 2954 2955 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 2956 2957 // Floating point operations had to be 'fast' to enable the induction. 2958 FastMathFlags Flags; 2959 Flags.setFast(); 2960 2961 Value *MulExp = B.CreateFMul(StepValue, Index); 2962 if (isa<Instruction>(MulExp)) 2963 // We have to check, the MulExp may be a constant. 
2964 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 2965 2966 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2967 "induction"); 2968 if (isa<Instruction>(BOp)) 2969 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2970 2971 return BOp; 2972 } 2973 case InductionDescriptor::IK_NoInduction: 2974 return nullptr; 2975 } 2976 llvm_unreachable("invalid enum"); 2977 } 2978 2979 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 2980 LoopScalarBody = OrigLoop->getHeader(); 2981 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 2982 LoopExitBlock = OrigLoop->getExitBlock(); 2983 assert(LoopExitBlock && "Must have an exit block"); 2984 assert(LoopVectorPreHeader && "Invalid loop structure"); 2985 2986 LoopMiddleBlock = 2987 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2988 LI, nullptr, Twine(Prefix) + "middle.block"); 2989 LoopScalarPreHeader = 2990 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 2991 nullptr, Twine(Prefix) + "scalar.ph"); 2992 // We intentionally don't let SplitBlock to update LoopInfo since 2993 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 2994 // LoopVectorBody is explicitly added to the correct place few lines later. 2995 LoopVectorBody = 2996 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2997 nullptr, nullptr, Twine(Prefix) + "vector.body"); 2998 2999 // Update dominator for loop exit. 3000 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3001 3002 // Create and register the new vector loop. 3003 Loop *Lp = LI->AllocateLoop(); 3004 Loop *ParentLoop = OrigLoop->getParentLoop(); 3005 3006 // Insert the new loop into the loop nest and register the new basic blocks 3007 // before calling any utilities such as SCEV that require valid LoopInfo. 3008 if (ParentLoop) { 3009 ParentLoop->addChildLoop(Lp); 3010 } else { 3011 LI->addTopLevelLoop(Lp); 3012 } 3013 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3014 return Lp; 3015 } 3016 3017 void InnerLoopVectorizer::createInductionResumeValues(Loop *L, 3018 Value *VectorTripCount) { 3019 assert(VectorTripCount && L && "Expected valid arguments"); 3020 // We are going to resume the execution of the scalar loop. 3021 // Go over all of the induction variables that we found and fix the 3022 // PHIs that are left in the scalar version of the loop. 3023 // The starting values of PHI nodes depend on the counter of the last 3024 // iteration in the vectorized loop. 3025 // If we come from a bypass edge then we need to start from the original 3026 // start value. 3027 for (auto &InductionEntry : Legal->getInductionVars()) { 3028 PHINode *OrigPhi = InductionEntry.first; 3029 InductionDescriptor II = InductionEntry.second; 3030 3031 // Create phi nodes to merge from the backedge-taken check block. 3032 PHINode *BCResumeVal = 3033 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3034 LoopScalarPreHeader->getTerminator()); 3035 // Copy original phi DL over to the new one. 3036 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3037 Value *&EndValue = IVEndValues[OrigPhi]; 3038 if (OrigPhi == OldInduction) { 3039 // We know what the end value is. 
3040 EndValue = VectorTripCount; 3041 } else { 3042 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3043 Type *StepType = II.getStep()->getType(); 3044 Instruction::CastOps CastOp = 3045 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3046 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3047 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3048 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3049 EndValue->setName("ind.end"); 3050 } 3051 3052 // The new PHI merges the original incoming value, in case of a bypass, 3053 // or the value at the end of the vectorized loop. 3054 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3055 3056 // Fix the scalar body counter (PHI node). 3057 // The old induction's phi node in the scalar body needs the truncated 3058 // value. 3059 for (BasicBlock *BB : LoopBypassBlocks) 3060 BCResumeVal->addIncoming(II.getStartValue(), BB); 3061 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3062 } 3063 } 3064 3065 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3066 MDNode *OrigLoopID) { 3067 assert(L && "Expected valid loop."); 3068 3069 // The trip counts should be cached by now. 3070 Value *Count = getOrCreateTripCount(L); 3071 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3072 3073 // We need the OrigLoop (scalar loop part) latch terminator to help 3074 // produce correct debug info for the middle block BB instructions. 3075 // The legality check stage guarantees that the loop will have a single 3076 // latch. 3077 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3078 "Scalar loop latch terminator isn't a branch"); 3079 BranchInst *ScalarLatchBr = 3080 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3081 3082 // Add a check in the middle block to see if we have completed 3083 // all of the iterations in the first vector loop. 3084 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3085 // If tail is to be folded, we know we don't need to run the remainder. 3086 Value *CmpN = Builder.getTrue(); 3087 if (!Cost->foldTailByMasking()) { 3088 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3089 VectorTripCount, "cmp.n", 3090 LoopMiddleBlock->getTerminator()); 3091 3092 // Here we use the same DebugLoc as the scalar loop latch branch instead 3093 // of the corresponding compare because they may have ended up with 3094 // different line numbers and we want to avoid awkward line stepping while 3095 // debugging. Eg. if the compare has got a line number inside the loop. 3096 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3097 } 3098 3099 BranchInst *BrInst = 3100 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3101 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3102 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3103 3104 // Get ready to start creating new instructions into the vectorized body. 3105 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3106 "Inconsistent vector loop preheader"); 3107 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3108 3109 Optional<MDNode *> VectorizedLoopID = 3110 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3111 LLVMLoopVectorizeFollowupVectorized}); 3112 if (VectorizedLoopID.hasValue()) { 3113 L->setLoopID(VectorizedLoopID.getValue()); 3114 3115 // Do not setAlreadyVectorized if loop attributes have been defined 3116 // explicitly. 
3117 return LoopVectorPreHeader; 3118 } 3119 3120 // Keep all loop hints from the original loop on the vector loop (we'll 3121 // replace the vectorizer-specific hints below). 3122 if (MDNode *LID = OrigLoop->getLoopID()) 3123 L->setLoopID(LID); 3124 3125 LoopVectorizeHints Hints(L, true, *ORE); 3126 Hints.setAlreadyVectorized(); 3127 3128 #ifdef EXPENSIVE_CHECKS 3129 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3130 LI->verify(*DT); 3131 #endif 3132 3133 return LoopVectorPreHeader; 3134 } 3135 3136 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3137 /* 3138 In this function we generate a new loop. The new loop will contain 3139 the vectorized instructions while the old loop will continue to run the 3140 scalar remainder. 3141 3142 [ ] <-- loop iteration number check. 3143 / | 3144 / v 3145 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3146 | / | 3147 | / v 3148 || [ ] <-- vector pre header. 3149 |/ | 3150 | v 3151 | [ ] \ 3152 | [ ]_| <-- vector loop. 3153 | | 3154 | v 3155 | -[ ] <--- middle-block. 3156 | / | 3157 | / v 3158 -|- >[ ] <--- new preheader. 3159 | | 3160 | v 3161 | [ ] \ 3162 | [ ]_| <-- old scalar loop to handle remainder. 3163 \ | 3164 \ v 3165 >[ ] <-- exit block. 3166 ... 3167 */ 3168 3169 // Get the metadata of the original loop before it gets modified. 3170 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3171 3172 // Create an empty vector loop, and prepare basic blocks for the runtime 3173 // checks. 3174 Loop *Lp = createVectorLoopSkeleton(""); 3175 3176 // Now, compare the new count to zero. If it is zero skip the vector loop and 3177 // jump to the scalar loop. This check also covers the case where the 3178 // backedge-taken count is uint##_max: adding one to it will overflow leading 3179 // to an incorrect trip count of zero. In this (rare) case we will also jump 3180 // to the scalar loop. 3181 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3182 3183 // Generate the code to check any assumptions that we've made for SCEV 3184 // expressions. 3185 emitSCEVChecks(Lp, LoopScalarPreHeader); 3186 3187 // Generate the code that checks in runtime if arrays overlap. We put the 3188 // checks into a separate block to make the more common case of few elements 3189 // faster. 3190 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3191 3192 // Some loops have a single integer induction variable, while other loops 3193 // don't. One example is c++ iterators that often have multiple pointer 3194 // induction variables. In the code below we also support a case where we 3195 // don't have a single induction variable. 3196 // 3197 // We try to obtain an induction variable from the original loop as hard 3198 // as possible. However if we don't find one that: 3199 // - is an integer 3200 // - counts from zero, stepping by one 3201 // - is the size of the widest induction variable type 3202 // then we create a new one. 3203 OldInduction = Legal->getPrimaryInduction(); 3204 Type *IdxTy = Legal->getWidestInductionType(); 3205 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3206 // The loop step is equal to the vectorization factor (num of SIMD elements) 3207 // times the unroll factor (num of SIMD instructions). 3208 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 3209 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3210 Induction = 3211 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3212 getDebugLocFromInstOrOperands(OldInduction)); 3213 3214 // Emit phis for the new starting index of the scalar loop. 
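  // These "bc.resume.val" phis pick the end value of the vector loop
  // (CountRoundDown for the primary induction) when the vector body ran, or
  // the original start value when a bypass check branched straight to the
  // scalar loop.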
  createInductionResumeValues(Lp, CountRoundDown);

  return completeLoopSkeleton(Lp, OrigLoopID);
}

// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the penultimate
  // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());
      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");
      Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
3280 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3281 PHI->addIncoming(I.second, MiddleBlock); 3282 } 3283 } 3284 3285 namespace { 3286 3287 struct CSEDenseMapInfo { 3288 static bool canHandle(const Instruction *I) { 3289 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3290 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3291 } 3292 3293 static inline Instruction *getEmptyKey() { 3294 return DenseMapInfo<Instruction *>::getEmptyKey(); 3295 } 3296 3297 static inline Instruction *getTombstoneKey() { 3298 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3299 } 3300 3301 static unsigned getHashValue(const Instruction *I) { 3302 assert(canHandle(I) && "Unknown instruction!"); 3303 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3304 I->value_op_end())); 3305 } 3306 3307 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3308 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3309 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3310 return LHS == RHS; 3311 return LHS->isIdenticalTo(RHS); 3312 } 3313 }; 3314 3315 } // end anonymous namespace 3316 3317 ///Perform cse of induction variable instructions. 3318 static void cse(BasicBlock *BB) { 3319 // Perform simple cse. 3320 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3321 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3322 Instruction *In = &*I++; 3323 3324 if (!CSEDenseMapInfo::canHandle(In)) 3325 continue; 3326 3327 // Check if we can replace this instruction with any of the 3328 // visited instructions. 3329 if (Instruction *V = CSEMap.lookup(In)) { 3330 In->replaceAllUsesWith(V); 3331 In->eraseFromParent(); 3332 continue; 3333 } 3334 3335 CSEMap[In] = In; 3336 } 3337 } 3338 3339 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3340 unsigned VF, 3341 bool &NeedToScalarize) { 3342 Function *F = CI->getCalledFunction(); 3343 Type *ScalarRetTy = CI->getType(); 3344 SmallVector<Type *, 4> Tys, ScalarTys; 3345 for (auto &ArgOp : CI->arg_operands()) 3346 ScalarTys.push_back(ArgOp->getType()); 3347 3348 // Estimate cost of scalarized vector call. The source operands are assumed 3349 // to be vectors, so we need to extract individual elements from there, 3350 // execute VF scalar calls, and then gather the result into the vector return 3351 // value. 3352 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, 3353 TTI::TCK_RecipThroughput); 3354 if (VF == 1) 3355 return ScalarCallCost; 3356 3357 // Compute corresponding vector type for return value and arguments. 3358 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3359 for (Type *ScalarTy : ScalarTys) 3360 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3361 3362 // Compute costs of unpacking argument values for the scalar calls and 3363 // packing the return values to a vector. 3364 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3365 3366 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3367 3368 // If we can't emit a vector call for this function, then the currently found 3369 // cost is the cost we need to return. 3370 NeedToScalarize = true; 3371 VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/); 3372 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3373 3374 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3375 return Cost; 3376 3377 // If the corresponding vector cost is cheaper, return its cost. 
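  // For instance (illustrative numbers only): with VF = 4, a scalar call cost
  // of 10 and a scalarization overhead of 6, the scalarized cost is
  // 4 * 10 + 6 = 46, so a vector library call costing 20 is chosen and
  // NeedToScalarize is cleared.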
3378 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, 3379 TTI::TCK_RecipThroughput); 3380 if (VectorCallCost < Cost) { 3381 NeedToScalarize = false; 3382 return VectorCallCost; 3383 } 3384 return Cost; 3385 } 3386 3387 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3388 unsigned VF) { 3389 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3390 assert(ID && "Expected intrinsic call!"); 3391 3392 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3393 return TTI.getIntrinsicInstrCost(CostAttrs, 3394 TargetTransformInfo::TCK_RecipThroughput); 3395 } 3396 3397 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3398 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3399 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3400 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3401 } 3402 3403 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3404 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3405 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3406 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3407 } 3408 3409 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3410 // For every instruction `I` in MinBWs, truncate the operands, create a 3411 // truncated version of `I` and reextend its result. InstCombine runs 3412 // later and will remove any ext/trunc pairs. 3413 SmallPtrSet<Value *, 4> Erased; 3414 for (const auto &KV : Cost->getMinimalBitwidths()) { 3415 // If the value wasn't vectorized, we must maintain the original scalar 3416 // type. The absence of the value from VectorLoopValueMap indicates that it 3417 // wasn't vectorized. 3418 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3419 continue; 3420 for (unsigned Part = 0; Part < UF; ++Part) { 3421 Value *I = getOrCreateVectorValue(KV.first, Part); 3422 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3423 continue; 3424 Type *OriginalTy = I->getType(); 3425 Type *ScalarTruncatedTy = 3426 IntegerType::get(OriginalTy->getContext(), KV.second); 3427 auto *TruncatedTy = FixedVectorType::get( 3428 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements()); 3429 if (TruncatedTy == OriginalTy) 3430 continue; 3431 3432 IRBuilder<> B(cast<Instruction>(I)); 3433 auto ShrinkOperand = [&](Value *V) -> Value * { 3434 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3435 if (ZI->getSrcTy() == TruncatedTy) 3436 return ZI->getOperand(0); 3437 return B.CreateZExtOrTrunc(V, TruncatedTy); 3438 }; 3439 3440 // The actual instruction modification depends on the instruction type, 3441 // unfortunately. 3442 Value *NewI = nullptr; 3443 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3444 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3445 ShrinkOperand(BO->getOperand(1))); 3446 3447 // Any wrapping introduced by shrinking this operation shouldn't be 3448 // considered undefined behavior. So, we can't unconditionally copy 3449 // arithmetic wrapping flags to NewI. 
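        // For example (illustrative): an i32 "add nuw" shrunk to i8 may wrap
        // for operand values that were fine at the original width, so the
        // nuw/nsw flags must not be carried over.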
3450 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3451 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3452 NewI = 3453 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3454 ShrinkOperand(CI->getOperand(1))); 3455 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3456 NewI = B.CreateSelect(SI->getCondition(), 3457 ShrinkOperand(SI->getTrueValue()), 3458 ShrinkOperand(SI->getFalseValue())); 3459 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3460 switch (CI->getOpcode()) { 3461 default: 3462 llvm_unreachable("Unhandled cast!"); 3463 case Instruction::Trunc: 3464 NewI = ShrinkOperand(CI->getOperand(0)); 3465 break; 3466 case Instruction::SExt: 3467 NewI = B.CreateSExtOrTrunc( 3468 CI->getOperand(0), 3469 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3470 break; 3471 case Instruction::ZExt: 3472 NewI = B.CreateZExtOrTrunc( 3473 CI->getOperand(0), 3474 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3475 break; 3476 } 3477 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3478 auto Elements0 = 3479 cast<VectorType>(SI->getOperand(0)->getType())->getNumElements(); 3480 auto *O0 = B.CreateZExtOrTrunc( 3481 SI->getOperand(0), 3482 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3483 auto Elements1 = 3484 cast<VectorType>(SI->getOperand(1)->getType())->getNumElements(); 3485 auto *O1 = B.CreateZExtOrTrunc( 3486 SI->getOperand(1), 3487 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3488 3489 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3490 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3491 // Don't do anything with the operands, just extend the result. 3492 continue; 3493 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3494 auto Elements = 3495 cast<VectorType>(IE->getOperand(0)->getType())->getNumElements(); 3496 auto *O0 = B.CreateZExtOrTrunc( 3497 IE->getOperand(0), 3498 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3499 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3500 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3501 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3502 auto Elements = 3503 cast<VectorType>(EE->getOperand(0)->getType())->getNumElements(); 3504 auto *O0 = B.CreateZExtOrTrunc( 3505 EE->getOperand(0), 3506 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3507 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3508 } else { 3509 // If we don't know what to do, be conservative and don't do anything. 3510 continue; 3511 } 3512 3513 // Lastly, extend the result. 3514 NewI->takeName(cast<Instruction>(I)); 3515 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3516 I->replaceAllUsesWith(Res); 3517 cast<Instruction>(I)->eraseFromParent(); 3518 Erased.insert(I); 3519 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3520 } 3521 } 3522 3523 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3524 for (const auto &KV : Cost->getMinimalBitwidths()) { 3525 // If the value wasn't vectorized, we must maintain the original scalar 3526 // type. The absence of the value from VectorLoopValueMap indicates that it 3527 // wasn't vectorized. 
3528 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3529 continue; 3530 for (unsigned Part = 0; Part < UF; ++Part) { 3531 Value *I = getOrCreateVectorValue(KV.first, Part); 3532 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3533 if (Inst && Inst->use_empty()) { 3534 Value *NewI = Inst->getOperand(0); 3535 Inst->eraseFromParent(); 3536 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3537 } 3538 } 3539 } 3540 } 3541 3542 void InnerLoopVectorizer::fixVectorizedLoop() { 3543 // Insert truncates and extends for any truncated instructions as hints to 3544 // InstCombine. 3545 if (VF > 1) 3546 truncateToMinimalBitwidths(); 3547 3548 // Fix widened non-induction PHIs by setting up the PHI operands. 3549 if (OrigPHIsToFix.size()) { 3550 assert(EnableVPlanNativePath && 3551 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3552 fixNonInductionPHIs(); 3553 } 3554 3555 // At this point every instruction in the original loop is widened to a 3556 // vector form. Now we need to fix the recurrences in the loop. These PHI 3557 // nodes are currently empty because we did not want to introduce cycles. 3558 // This is the second stage of vectorizing recurrences. 3559 fixCrossIterationPHIs(); 3560 3561 // Forget the original basic block. 3562 PSE.getSE()->forgetLoop(OrigLoop); 3563 3564 // Fix-up external users of the induction variables. 3565 for (auto &Entry : Legal->getInductionVars()) 3566 fixupIVUsers(Entry.first, Entry.second, 3567 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3568 IVEndValues[Entry.first], LoopMiddleBlock); 3569 3570 fixLCSSAPHIs(); 3571 for (Instruction *PI : PredicatedInstructions) 3572 sinkScalarOperands(&*PI); 3573 3574 // Remove redundant induction instructions. 3575 cse(LoopVectorBody); 3576 3577 // Set/update profile weights for the vector and remainder loops as original 3578 // loop iterations are now distributed among them. Note that original loop 3579 // represented by LoopScalarBody becomes remainder loop after vectorization. 3580 // 3581 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3582 // end up getting slightly roughened result but that should be OK since 3583 // profile is not inherently precise anyway. Note also possible bypass of 3584 // vector code caused by legality checks is ignored, assigning all the weight 3585 // to the vector loop, optimistically. 3586 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), 3587 LI->getLoopFor(LoopVectorBody), 3588 LI->getLoopFor(LoopScalarBody), VF * UF); 3589 } 3590 3591 void InnerLoopVectorizer::fixCrossIterationPHIs() { 3592 // In order to support recurrences we need to be able to vectorize Phi nodes. 3593 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3594 // stage #2: We now need to fix the recurrences by adding incoming edges to 3595 // the currently empty PHI nodes. At this point every instruction in the 3596 // original loop is widened to a vector form so we can use them to construct 3597 // the incoming edges. 3598 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 3599 // Handle first-order recurrences and reductions that need to be fixed. 3600 if (Legal->isFirstOrderRecurrence(&Phi)) 3601 fixFirstOrderRecurrence(&Phi); 3602 else if (Legal->isReductionVariable(&Phi)) 3603 fixReduction(&Phi); 3604 } 3605 } 3606 3607 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3608 // This is the second phase of vectorizing first-order recurrences. 
An 3609 // overview of the transformation is described below. Suppose we have the 3610 // following loop. 3611 // 3612 // for (int i = 0; i < n; ++i) 3613 // b[i] = a[i] - a[i - 1]; 3614 // 3615 // There is a first-order recurrence on "a". For this loop, the shorthand 3616 // scalar IR looks like: 3617 // 3618 // scalar.ph: 3619 // s_init = a[-1] 3620 // br scalar.body 3621 // 3622 // scalar.body: 3623 // i = phi [0, scalar.ph], [i+1, scalar.body] 3624 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3625 // s2 = a[i] 3626 // b[i] = s2 - s1 3627 // br cond, scalar.body, ... 3628 // 3629 // In this example, s1 is a recurrence because it's value depends on the 3630 // previous iteration. In the first phase of vectorization, we created a 3631 // temporary value for s1. We now complete the vectorization and produce the 3632 // shorthand vector IR shown below (for VF = 4, UF = 1). 3633 // 3634 // vector.ph: 3635 // v_init = vector(..., ..., ..., a[-1]) 3636 // br vector.body 3637 // 3638 // vector.body 3639 // i = phi [0, vector.ph], [i+4, vector.body] 3640 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3641 // v2 = a[i, i+1, i+2, i+3]; 3642 // v3 = vector(v1(3), v2(0, 1, 2)) 3643 // b[i, i+1, i+2, i+3] = v2 - v3 3644 // br cond, vector.body, middle.block 3645 // 3646 // middle.block: 3647 // x = v2(3) 3648 // br scalar.ph 3649 // 3650 // scalar.ph: 3651 // s_init = phi [x, middle.block], [a[-1], otherwise] 3652 // br scalar.body 3653 // 3654 // After execution completes the vector loop, we extract the next value of 3655 // the recurrence (x) to use as the initial value in the scalar loop. 3656 3657 // Get the original loop preheader and single loop latch. 3658 auto *Preheader = OrigLoop->getLoopPreheader(); 3659 auto *Latch = OrigLoop->getLoopLatch(); 3660 3661 // Get the initial and previous values of the scalar recurrence. 3662 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 3663 auto *Previous = Phi->getIncomingValueForBlock(Latch); 3664 3665 // Create a vector from the initial value. 3666 auto *VectorInit = ScalarInit; 3667 if (VF > 1) { 3668 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3669 VectorInit = Builder.CreateInsertElement( 3670 UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF)), 3671 VectorInit, Builder.getInt32(VF - 1), "vector.recur.init"); 3672 } 3673 3674 // We constructed a temporary phi node in the first phase of vectorization. 3675 // This phi node will eventually be deleted. 3676 Builder.SetInsertPoint( 3677 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3678 3679 // Create a phi node for the new recurrence. The current value will either be 3680 // the initial value inserted into a vector or loop-varying vector value. 3681 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3682 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3683 3684 // Get the vectorized previous value of the last part UF - 1. It appears last 3685 // among all unrolled iterations, due to the order of their construction. 3686 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3687 3688 // Find and set the insertion point after the previous value if it is an 3689 // instruction. 3690 BasicBlock::iterator InsertPt; 3691 // Note that the previous value may have been constant-folded so it is not 3692 // guaranteed to be an instruction in the vector loop. 3693 // FIXME: Loop invariant values do not form recurrences. We should deal with 3694 // them earlier. 
3695 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3696 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3697 else { 3698 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3699 if (isa<PHINode>(PreviousLastPart)) 3700 // If the previous value is a phi node, we should insert after all the phi 3701 // nodes in the block containing the PHI to avoid breaking basic block 3702 // verification. Note that the basic block may be different to 3703 // LoopVectorBody, in case we predicate the loop. 3704 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3705 else 3706 InsertPt = ++PreviousInst->getIterator(); 3707 } 3708 Builder.SetInsertPoint(&*InsertPt); 3709 3710 // We will construct a vector for the recurrence by combining the values for 3711 // the current and previous iterations. This is the required shuffle mask. 3712 SmallVector<int, 8> ShuffleMask(VF); 3713 ShuffleMask[0] = VF - 1; 3714 for (unsigned I = 1; I < VF; ++I) 3715 ShuffleMask[I] = I + VF - 1; 3716 3717 // The vector from which to take the initial value for the current iteration 3718 // (actual or unrolled). Initially, this is the vector phi node. 3719 Value *Incoming = VecPhi; 3720 3721 // Shuffle the current and previous vector and update the vector parts. 3722 for (unsigned Part = 0; Part < UF; ++Part) { 3723 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3724 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3725 auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, 3726 ShuffleMask) 3727 : Incoming; 3728 PhiPart->replaceAllUsesWith(Shuffle); 3729 cast<Instruction>(PhiPart)->eraseFromParent(); 3730 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3731 Incoming = PreviousPart; 3732 } 3733 3734 // Fix the latch value of the new recurrence in the vector loop. 3735 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3736 3737 // Extract the last vector element in the middle block. This will be the 3738 // initial value for the recurrence when jumping to the scalar loop. 3739 auto *ExtractForScalar = Incoming; 3740 if (VF > 1) { 3741 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3742 ExtractForScalar = Builder.CreateExtractElement( 3743 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); 3744 } 3745 // Extract the second last element in the middle block if the 3746 // Phi is used outside the loop. We need to extract the phi itself 3747 // and not the last element (the phi update in the current iteration). This 3748 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3749 // when the scalar loop is not run at all. 3750 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3751 if (VF > 1) 3752 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3753 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); 3754 // When loop is unrolled without vectorizing, initialize 3755 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 3756 // `Incoming`. This is analogous to the vectorized case above: extracting the 3757 // second last element when VF > 1. 3758 else if (UF > 1) 3759 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 3760 3761 // Fix the initial value of the original recurrence in the scalar loop. 
3762 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3763 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3764 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3765 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3766 Start->addIncoming(Incoming, BB); 3767 } 3768 3769 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3770 Phi->setName("scalar.recur"); 3771 3772 // Finally, fix users of the recurrence outside the loop. The users will need 3773 // either the last value of the scalar recurrence or the last value of the 3774 // vector recurrence we extracted in the middle block. Since the loop is in 3775 // LCSSA form, we just need to find all the phi nodes for the original scalar 3776 // recurrence in the exit block, and then add an edge for the middle block. 3777 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3778 if (LCSSAPhi.getIncomingValue(0) == Phi) { 3779 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 3780 } 3781 } 3782 } 3783 3784 void InnerLoopVectorizer::fixReduction(PHINode *Phi) { 3785 Constant *Zero = Builder.getInt32(0); 3786 3787 // Get it's reduction variable descriptor. 3788 assert(Legal->isReductionVariable(Phi) && 3789 "Unable to find the reduction variable"); 3790 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 3791 3792 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3793 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3794 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3795 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = 3796 RdxDesc.getMinMaxRecurrenceKind(); 3797 setDebugLocFromInst(Builder, ReductionStartValue); 3798 3799 // We need to generate a reduction vector from the incoming scalar. 3800 // To do so, we need to generate the 'identity' vector and override 3801 // one of the elements with the incoming scalar reduction. We need 3802 // to do it in the vector-loop preheader. 3803 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3804 3805 // This is the vector-clone of the value that leaves the loop. 3806 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); 3807 3808 // Find the reduction identity variable. Zero for addition, or, xor, 3809 // one for multiplication, -1 for And. 3810 Value *Identity; 3811 Value *VectorStart; 3812 if (RK == RecurrenceDescriptor::RK_IntegerMinMax || 3813 RK == RecurrenceDescriptor::RK_FloatMinMax) { 3814 // MinMax reduction have the start value as their identify. 3815 if (VF == 1) { 3816 VectorStart = Identity = ReductionStartValue; 3817 } else { 3818 VectorStart = Identity = 3819 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); 3820 } 3821 } else { 3822 // Handle other reduction kinds: 3823 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 3824 RK, VecTy->getScalarType()); 3825 if (VF == 1) { 3826 Identity = Iden; 3827 // This vector is the Identity vector where the first element is the 3828 // incoming scalar reduction. 3829 VectorStart = ReductionStartValue; 3830 } else { 3831 Identity = ConstantVector::getSplat({VF, false}, Iden); 3832 3833 // This vector is the Identity vector where the first element is the 3834 // incoming scalar reduction. 3835 VectorStart = 3836 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); 3837 } 3838 } 3839 3840 // Wrap flags are in general invalid after vectorization, clear them. 3841 clearReductionWrapFlags(RdxDesc); 3842 3843 // Fix the vector-loop phi. 
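  // For example (shorthand, VF = 4, UF = 1): an integer add reduction with
  // start value %s gets <%s, 0, 0, 0> as the preheader-incoming value of its
  // vector phi, while the latch-incoming value is the vectorized update
  // computed in the loop body.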
3844 3845 // Reductions do not have to start at zero. They can start with 3846 // any loop invariant values. 3847 BasicBlock *Latch = OrigLoop->getLoopLatch(); 3848 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 3849 3850 for (unsigned Part = 0; Part < UF; ++Part) { 3851 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 3852 Value *Val = getOrCreateVectorValue(LoopVal, Part); 3853 // Make sure to add the reduction start value only to the 3854 // first unroll part. 3855 Value *StartVal = (Part == 0) ? VectorStart : Identity; 3856 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 3857 cast<PHINode>(VecRdxPhi) 3858 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3859 } 3860 3861 // Before each round, move the insertion point right between 3862 // the PHIs and the values we are going to write. 3863 // This allows us to write both PHINodes and the extractelement 3864 // instructions. 3865 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3866 3867 setDebugLocFromInst(Builder, LoopExitInst); 3868 3869 // If tail is folded by masking, the vector value to leave the loop should be 3870 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3871 // instead of the former. 3872 if (Cost->foldTailByMasking()) { 3873 for (unsigned Part = 0; Part < UF; ++Part) { 3874 Value *VecLoopExitInst = 3875 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3876 Value *Sel = nullptr; 3877 for (User *U : VecLoopExitInst->users()) { 3878 if (isa<SelectInst>(U)) { 3879 assert(!Sel && "Reduction exit feeding two selects"); 3880 Sel = U; 3881 } else 3882 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3883 } 3884 assert(Sel && "Reduction exit feeds no select"); 3885 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 3886 } 3887 } 3888 3889 // If the vector reduction can be performed in a smaller type, we truncate 3890 // then extend the loop exit value to enable InstCombine to evaluate the 3891 // entire expression in the smaller type. 3892 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { 3893 Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF); 3894 Builder.SetInsertPoint( 3895 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 3896 VectorParts RdxParts(UF); 3897 for (unsigned Part = 0; Part < UF; ++Part) { 3898 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3899 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3900 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3901 : Builder.CreateZExt(Trunc, VecTy); 3902 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 3903 UI != RdxParts[Part]->user_end();) 3904 if (*UI != Trunc) { 3905 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 3906 RdxParts[Part] = Extnd; 3907 } else { 3908 ++UI; 3909 } 3910 } 3911 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3912 for (unsigned Part = 0; Part < UF; ++Part) { 3913 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3914 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 3915 } 3916 } 3917 3918 // Reduce all of the unrolled parts into a single vector. 3919 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 3920 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3921 3922 // The middle block terminator has already been assigned a DebugLoc here (the 3923 // OrigLoop's single latch terminator). 
We want the whole middle block to 3924 // appear to execute on this line because: (a) it is all compiler generated, 3925 // (b) these instructions are always executed after evaluating the latch 3926 // conditional branch, and (c) other passes may add new predecessors which 3927 // terminate on this line. This is the easiest way to ensure we don't 3928 // accidentally cause an extra step back into the loop while debugging. 3929 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 3930 for (unsigned Part = 1; Part < UF; ++Part) { 3931 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3932 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3933 // Floating point operations had to be 'fast' to enable the reduction. 3934 ReducedPartRdx = addFastMathFlag( 3935 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 3936 ReducedPartRdx, "bin.rdx"), 3937 RdxDesc.getFastMathFlags()); 3938 else 3939 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 3940 RdxPart); 3941 } 3942 3943 if (VF > 1) { 3944 bool NoNaN = Legal->hasFunNoNaNAttr(); 3945 ReducedPartRdx = 3946 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 3947 // If the reduction can be performed in a smaller type, we need to extend 3948 // the reduction to the wider type before we branch to the original loop. 3949 if (Phi->getType() != RdxDesc.getRecurrenceType()) 3950 ReducedPartRdx = 3951 RdxDesc.isSigned() 3952 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 3953 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 3954 } 3955 3956 // Create a phi node that merges control-flow from the backedge-taken check 3957 // block and the middle block. 3958 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 3959 LoopScalarPreHeader->getTerminator()); 3960 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3961 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3962 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3963 3964 // Now, we need to fix the users of the reduction variable 3965 // inside and outside of the scalar remainder loop. 3966 // We know that the loop is in LCSSA form. We need to update the 3967 // PHI nodes in the exit blocks. 3968 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3969 // All PHINodes need to have a single entry edge, or two if 3970 // we already fixed them. 3971 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 3972 3973 // We found a reduction value exit-PHI. Update it with the 3974 // incoming bypass edge. 3975 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 3976 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 3977 } // end of the LCSSA phi scan. 3978 3979 // Fix the scalar loop reduction variable with the incoming reduction sum 3980 // from the vector body and from the backedge value. 3981 int IncomingEdgeBlockIdx = 3982 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 3983 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 3984 // Pick the other block. 3985 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 3986 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3987 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3988 } 3989 3990 void InnerLoopVectorizer::clearReductionWrapFlags( 3991 RecurrenceDescriptor &RdxDesc) { 3992 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3993 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 3994 RK != RecurrenceDescriptor::RK_IntegerMult) 3995 return; 3996 3997 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 3998 assert(LoopExitInstr && "null loop exit instruction"); 3999 SmallVector<Instruction *, 8> Worklist; 4000 SmallPtrSet<Instruction *, 8> Visited; 4001 Worklist.push_back(LoopExitInstr); 4002 Visited.insert(LoopExitInstr); 4003 4004 while (!Worklist.empty()) { 4005 Instruction *Cur = Worklist.pop_back_val(); 4006 if (isa<OverflowingBinaryOperator>(Cur)) 4007 for (unsigned Part = 0; Part < UF; ++Part) { 4008 Value *V = getOrCreateVectorValue(Cur, Part); 4009 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4010 } 4011 4012 for (User *U : Cur->users()) { 4013 Instruction *UI = cast<Instruction>(U); 4014 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4015 Visited.insert(UI).second) 4016 Worklist.push_back(UI); 4017 } 4018 } 4019 } 4020 4021 void InnerLoopVectorizer::fixLCSSAPHIs() { 4022 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4023 if (LCSSAPhi.getNumIncomingValues() == 1) { 4024 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4025 // Non-instruction incoming values will have only one value. 4026 unsigned LastLane = 0; 4027 if (isa<Instruction>(IncomingValue)) 4028 LastLane = Cost->isUniformAfterVectorization( 4029 cast<Instruction>(IncomingValue), VF) 4030 ? 0 4031 : VF - 1; 4032 // Can be a loop invariant incoming value or the last scalar value to be 4033 // extracted from the vectorized loop. 4034 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4035 Value *lastIncomingValue = 4036 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 4037 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4038 } 4039 } 4040 } 4041 4042 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4043 // The basic block and loop containing the predicated instruction. 4044 auto *PredBB = PredInst->getParent(); 4045 auto *VectorLoop = LI->getLoopFor(PredBB); 4046 4047 // Initialize a worklist with the operands of the predicated instruction. 4048 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4049 4050 // Holds instructions that we need to analyze again. An instruction may be 4051 // reanalyzed if we don't yet know if we can sink it or not. 4052 SmallVector<Instruction *, 8> InstsToReanalyze; 4053 4054 // Returns true if a given use occurs in the predicated block. Phi nodes use 4055 // their operands in their corresponding predecessor blocks. 4056 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4057 auto *I = cast<Instruction>(U.getUser()); 4058 BasicBlock *BB = I->getParent(); 4059 if (auto *Phi = dyn_cast<PHINode>(I)) 4060 BB = Phi->getIncomingBlock( 4061 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4062 return BB == PredBB; 4063 }; 4064 4065 // Iteratively sink the scalarized operands of the predicated instruction 4066 // into the block we created for it. When an instruction is sunk, it's 4067 // operands are then added to the worklist. The algorithm ends after one pass 4068 // through the worklist doesn't sink a single instruction. 
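  // Sketch of the fixed-point iteration: operands whose uses are not yet all
  // inside PredBB are parked in InstsToReanalyze and retried on the next
  // pass; the outer loop stops after the first pass that sinks nothing.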
4069 bool Changed; 4070 do { 4071 // Add the instructions that need to be reanalyzed to the worklist, and 4072 // reset the changed indicator. 4073 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4074 InstsToReanalyze.clear(); 4075 Changed = false; 4076 4077 while (!Worklist.empty()) { 4078 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4079 4080 // We can't sink an instruction if it is a phi node, is already in the 4081 // predicated block, is not in the loop, or may have side effects. 4082 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4083 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4084 continue; 4085 4086 // It's legal to sink the instruction if all its uses occur in the 4087 // predicated block. Otherwise, there's nothing to do yet, and we may 4088 // need to reanalyze the instruction. 4089 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4090 InstsToReanalyze.push_back(I); 4091 continue; 4092 } 4093 4094 // Move the instruction to the beginning of the predicated block, and add 4095 // it's operands to the worklist. 4096 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4097 Worklist.insert(I->op_begin(), I->op_end()); 4098 4099 // The sinking may have enabled other instructions to be sunk, so we will 4100 // need to iterate. 4101 Changed = true; 4102 } 4103 } while (Changed); 4104 } 4105 4106 void InnerLoopVectorizer::fixNonInductionPHIs() { 4107 for (PHINode *OrigPhi : OrigPHIsToFix) { 4108 PHINode *NewPhi = 4109 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4110 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4111 4112 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4113 predecessors(OrigPhi->getParent())); 4114 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4115 predecessors(NewPhi->getParent())); 4116 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4117 "Scalar and Vector BB should have the same number of predecessors"); 4118 4119 // The insertion point in Builder may be invalidated by the time we get 4120 // here. Force the Builder insertion point to something valid so that we do 4121 // not run into issues during insertion point restore in 4122 // getOrCreateVectorValue calls below. 4123 Builder.SetInsertPoint(NewPhi); 4124 4125 // The predecessor order is preserved and we can rely on mapping between 4126 // scalar and vector block predecessors. 4127 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4128 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4129 4130 // When looking up the new scalar/vector values to fix up, use incoming 4131 // values from original phi. 4132 Value *ScIncV = 4133 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4134 4135 // Scalar incoming value may need a broadcast 4136 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4137 NewPhi->addIncoming(NewIncV, NewPredBB); 4138 } 4139 } 4140 } 4141 4142 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, 4143 unsigned UF, unsigned VF, 4144 bool IsPtrLoopInvariant, 4145 SmallBitVector &IsIndexLoopInvariant, 4146 VPTransformState &State) { 4147 // Construct a vector GEP by widening the operands of the scalar GEP as 4148 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4149 // results in a vector of pointers when at least one operand of the GEP 4150 // is vector-typed. Thus, to keep the representation compact, we only use 4151 // vector-typed operands for loop-varying values. 
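  // For illustration only, assuming VF = 4, a loop-invariant base %base and a
  // loop-varying index %iv:
  //   scalar:  %gep = getelementptr i32, i32* %base, i64 %iv
  //   widened: %gep = getelementptr i32, i32* %base, <4 x i64> %vec.iv
  // The result is a <4 x i32*> vector of pointers because one operand is
  // vector-typed, while the loop-invariant base is kept scalar.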
4152 4153 if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4154 // If we are vectorizing, but the GEP has only loop-invariant operands, 4155 // the GEP we build (by only using vector-typed operands for 4156 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4157 // produce a vector of pointers, we need to either arbitrarily pick an 4158 // operand to broadcast, or broadcast a clone of the original GEP. 4159 // Here, we broadcast a clone of the original. 4160 // 4161 // TODO: If at some point we decide to scalarize instructions having 4162 // loop-invariant operands, this special case will no longer be 4163 // required. We would add the scalarization decision to 4164 // collectLoopScalars() and teach getVectorValue() to broadcast 4165 // the lane-zero scalar value. 4166 auto *Clone = Builder.Insert(GEP->clone()); 4167 for (unsigned Part = 0; Part < UF; ++Part) { 4168 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4169 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4170 addMetadata(EntryPart, GEP); 4171 } 4172 } else { 4173 // If the GEP has at least one loop-varying operand, we are sure to 4174 // produce a vector of pointers. But if we are only unrolling, we want 4175 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4176 // produce with the code below will be scalar (if VF == 1) or vector 4177 // (otherwise). Note that for the unroll-only case, we still maintain 4178 // values in the vector mapping with initVector, as we do for other 4179 // instructions. 4180 for (unsigned Part = 0; Part < UF; ++Part) { 4181 // The pointer operand of the new GEP. If it's loop-invariant, we 4182 // won't broadcast it. 4183 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) 4184 : State.get(Operands.getOperand(0), Part); 4185 4186 // Collect all the indices for the new GEP. If any index is 4187 // loop-invariant, we won't broadcast it. 4188 SmallVector<Value *, 4> Indices; 4189 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4190 VPValue *Operand = Operands.getOperand(I); 4191 if (IsIndexLoopInvariant[I - 1]) 4192 Indices.push_back(State.get(Operand, {0, 0})); 4193 else 4194 Indices.push_back(State.get(Operand, Part)); 4195 } 4196 4197 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4198 // but it should be a vector, otherwise. 4199 auto *NewGEP = 4200 GEP->isInBounds() 4201 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4202 Indices) 4203 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4204 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4205 "NewGEP is not a pointer vector"); 4206 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4207 addMetadata(NewGEP, GEP); 4208 } 4209 } 4210 } 4211 4212 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4213 unsigned VF) { 4214 PHINode *P = cast<PHINode>(PN); 4215 if (EnableVPlanNativePath) { 4216 // Currently we enter here in the VPlan-native path for non-induction 4217 // PHIs where all control flow is uniform. We simply widen these PHIs. 4218 // Create a vector phi with no operands - the vector phi operands will be 4219 // set at the end of vector code generation. 4220 Type *VecTy = 4221 (VF == 1) ? 
PN->getType() : FixedVectorType::get(PN->getType(), VF); 4222 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4223 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4224 OrigPHIsToFix.push_back(P); 4225 4226 return; 4227 } 4228 4229 assert(PN->getParent() == OrigLoop->getHeader() && 4230 "Non-header phis should have been handled elsewhere"); 4231 4232 // In order to support recurrences we need to be able to vectorize Phi nodes. 4233 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4234 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4235 // this value when we vectorize all of the instructions that use the PHI. 4236 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4237 for (unsigned Part = 0; Part < UF; ++Part) { 4238 // This is phase one of vectorizing PHIs. 4239 Type *VecTy = 4240 (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF); 4241 Value *EntryPart = PHINode::Create( 4242 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4243 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4244 } 4245 return; 4246 } 4247 4248 setDebugLocFromInst(Builder, P); 4249 4250 // This PHINode must be an induction variable. 4251 // Make sure that we know about it. 4252 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4253 4254 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4255 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4256 4257 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4258 // which can be found from the original scalar operations. 4259 switch (II.getKind()) { 4260 case InductionDescriptor::IK_NoInduction: 4261 llvm_unreachable("Unknown induction"); 4262 case InductionDescriptor::IK_IntInduction: 4263 case InductionDescriptor::IK_FpInduction: 4264 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4265 case InductionDescriptor::IK_PtrInduction: { 4266 // Handle the pointer induction variable case. 4267 assert(P->getType()->isPointerTy() && "Unexpected type."); 4268 4269 if (Cost->isScalarAfterVectorization(P, VF)) { 4270 // This is the normalized GEP that starts counting at zero. 4271 Value *PtrInd = 4272 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4273 // Determine the number of scalars we need to generate for each unroll 4274 // iteration. If the instruction is uniform, we only need to generate the 4275 // first lane. Otherwise, we generate all VF values. 4276 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 
1 : VF; 4277 for (unsigned Part = 0; Part < UF; ++Part) { 4278 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4279 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); 4280 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4281 Value *SclrGep = 4282 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4283 SclrGep->setName("next.gep"); 4284 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4285 } 4286 } 4287 return; 4288 } 4289 assert(isa<SCEVConstant>(II.getStep()) && 4290 "Induction step not a SCEV constant!"); 4291 Type *PhiType = II.getStep()->getType(); 4292 4293 // Build a pointer phi 4294 Value *ScalarStartValue = II.getStartValue(); 4295 Type *ScStValueType = ScalarStartValue->getType(); 4296 PHINode *NewPointerPhi = 4297 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4298 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4299 4300 // A pointer induction, performed by using a gep 4301 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4302 Instruction *InductionLoc = LoopLatch->getTerminator(); 4303 const SCEV *ScalarStep = II.getStep(); 4304 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4305 Value *ScalarStepValue = 4306 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4307 Value *InductionGEP = GetElementPtrInst::Create( 4308 ScStValueType->getPointerElementType(), NewPointerPhi, 4309 Builder.CreateMul(ScalarStepValue, ConstantInt::get(PhiType, VF * UF)), 4310 "ptr.ind", InductionLoc); 4311 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4312 4313 // Create UF many actual address geps that use the pointer 4314 // phi as base and a vectorized version of the step value 4315 // (<step*0, ..., step*N>) as offset. 4316 for (unsigned Part = 0; Part < UF; ++Part) { 4317 SmallVector<Constant *, 8> Indices; 4318 // Create a vector of consecutive numbers from zero to VF. 4319 for (unsigned i = 0; i < VF; ++i) 4320 Indices.push_back(ConstantInt::get(PhiType, i + Part * VF)); 4321 Constant *StartOffset = ConstantVector::get(Indices); 4322 4323 Value *GEP = Builder.CreateGEP( 4324 ScStValueType->getPointerElementType(), NewPointerPhi, 4325 Builder.CreateMul(StartOffset, 4326 Builder.CreateVectorSplat(VF, ScalarStepValue), 4327 "vector.gep")); 4328 VectorLoopValueMap.setVectorValue(P, Part, GEP); 4329 } 4330 } 4331 } 4332 } 4333 4334 /// A helper function for checking whether an integer division-related 4335 /// instruction may divide by zero (in which case it must be predicated if 4336 /// executed conditionally in the scalar code). 4337 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4338 /// Non-zero divisors that are non compile-time constants will not be 4339 /// converted into multiplication, so we will still end up scalarizing 4340 /// the division, but can do so w/o predication. 
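/// For example (hypothetical operands, shown for illustration only):
/// 'udiv i32 %a, %b' with a non-constant %b may divide by zero and so must be
/// predicated when it executes conditionally, whereas 'udiv i32 %a, 7' cannot
/// divide by zero and only needs to be scalarized, not predicated.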
4341 static bool mayDivideByZero(Instruction &I) { 4342 assert((I.getOpcode() == Instruction::UDiv || 4343 I.getOpcode() == Instruction::SDiv || 4344 I.getOpcode() == Instruction::URem || 4345 I.getOpcode() == Instruction::SRem) && 4346 "Unexpected instruction"); 4347 Value *Divisor = I.getOperand(1); 4348 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4349 return !CInt || CInt->isZero(); 4350 } 4351 4352 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, 4353 VPTransformState &State) { 4354 switch (I.getOpcode()) { 4355 case Instruction::Call: 4356 case Instruction::Br: 4357 case Instruction::PHI: 4358 case Instruction::GetElementPtr: 4359 case Instruction::Select: 4360 llvm_unreachable("This instruction is handled by a different recipe."); 4361 case Instruction::UDiv: 4362 case Instruction::SDiv: 4363 case Instruction::SRem: 4364 case Instruction::URem: 4365 case Instruction::Add: 4366 case Instruction::FAdd: 4367 case Instruction::Sub: 4368 case Instruction::FSub: 4369 case Instruction::FNeg: 4370 case Instruction::Mul: 4371 case Instruction::FMul: 4372 case Instruction::FDiv: 4373 case Instruction::FRem: 4374 case Instruction::Shl: 4375 case Instruction::LShr: 4376 case Instruction::AShr: 4377 case Instruction::And: 4378 case Instruction::Or: 4379 case Instruction::Xor: { 4380 // Just widen unops and binops. 4381 setDebugLocFromInst(Builder, &I); 4382 4383 for (unsigned Part = 0; Part < UF; ++Part) { 4384 SmallVector<Value *, 2> Ops; 4385 for (VPValue *VPOp : User.operands()) 4386 Ops.push_back(State.get(VPOp, Part)); 4387 4388 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4389 4390 if (auto *VecOp = dyn_cast<Instruction>(V)) 4391 VecOp->copyIRFlags(&I); 4392 4393 // Use this vector value for all users of the original instruction. 4394 VectorLoopValueMap.setVectorValue(&I, Part, V); 4395 addMetadata(V, &I); 4396 } 4397 4398 break; 4399 } 4400 case Instruction::ICmp: 4401 case Instruction::FCmp: { 4402 // Widen compares. Generate vector compares. 4403 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4404 auto *Cmp = cast<CmpInst>(&I); 4405 setDebugLocFromInst(Builder, Cmp); 4406 for (unsigned Part = 0; Part < UF; ++Part) { 4407 Value *A = State.get(User.getOperand(0), Part); 4408 Value *B = State.get(User.getOperand(1), Part); 4409 Value *C = nullptr; 4410 if (FCmp) { 4411 // Propagate fast math flags. 4412 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4413 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4414 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4415 } else { 4416 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4417 } 4418 VectorLoopValueMap.setVectorValue(&I, Part, C); 4419 addMetadata(C, &I); 4420 } 4421 4422 break; 4423 } 4424 4425 case Instruction::ZExt: 4426 case Instruction::SExt: 4427 case Instruction::FPToUI: 4428 case Instruction::FPToSI: 4429 case Instruction::FPExt: 4430 case Instruction::PtrToInt: 4431 case Instruction::IntToPtr: 4432 case Instruction::SIToFP: 4433 case Instruction::UIToFP: 4434 case Instruction::Trunc: 4435 case Instruction::FPTrunc: 4436 case Instruction::BitCast: { 4437 auto *CI = cast<CastInst>(&I); 4438 setDebugLocFromInst(Builder, CI); 4439 4440 /// Vectorize casts. 4441 Type *DestTy = 4442 (VF == 1) ? 
CI->getType() : FixedVectorType::get(CI->getType(), VF); 4443 4444 for (unsigned Part = 0; Part < UF; ++Part) { 4445 Value *A = State.get(User.getOperand(0), Part); 4446 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4447 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4448 addMetadata(Cast, &I); 4449 } 4450 break; 4451 } 4452 default: 4453 // This instruction is not vectorized by simple widening. 4454 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4455 llvm_unreachable("Unhandled instruction!"); 4456 } // end of switch. 4457 } 4458 4459 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, 4460 VPTransformState &State) { 4461 assert(!isa<DbgInfoIntrinsic>(I) && 4462 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4463 setDebugLocFromInst(Builder, &I); 4464 4465 Module *M = I.getParent()->getParent()->getParent(); 4466 auto *CI = cast<CallInst>(&I); 4467 4468 SmallVector<Type *, 4> Tys; 4469 for (Value *ArgOperand : CI->arg_operands()) 4470 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4471 4472 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4473 4474 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4475 // version of the instruction. 4476 // Is it beneficial to perform intrinsic call compared to lib call? 4477 bool NeedToScalarize = false; 4478 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4479 bool UseVectorIntrinsic = 4480 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4481 assert((UseVectorIntrinsic || !NeedToScalarize) && 4482 "Instruction should be scalarized elsewhere."); 4483 4484 for (unsigned Part = 0; Part < UF; ++Part) { 4485 SmallVector<Value *, 4> Args; 4486 for (auto &I : enumerate(ArgOperands.operands())) { 4487 // Some intrinsics have a scalar argument - don't replace it with a 4488 // vector. 4489 Value *Arg; 4490 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4491 Arg = State.get(I.value(), Part); 4492 else 4493 Arg = State.get(I.value(), {0, 0}); 4494 Args.push_back(Arg); 4495 } 4496 4497 Function *VectorF; 4498 if (UseVectorIntrinsic) { 4499 // Use vector version of the intrinsic. 4500 Type *TysForDecl[] = {CI->getType()}; 4501 if (VF > 1) 4502 TysForDecl[0] = 4503 FixedVectorType::get(CI->getType()->getScalarType(), VF); 4504 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4505 assert(VectorF && "Can't retrieve vector intrinsic."); 4506 } else { 4507 // Use vector version of the function call. 4508 const VFShape Shape = 4509 VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/); 4510 #ifndef NDEBUG 4511 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4512 "Can't create vector function."); 4513 #endif 4514 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4515 } 4516 SmallVector<OperandBundleDef, 1> OpBundles; 4517 CI->getOperandBundlesAsDefs(OpBundles); 4518 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4519 4520 if (isa<FPMathOperator>(V)) 4521 V->copyFastMathFlags(CI); 4522 4523 VectorLoopValueMap.setVectorValue(&I, Part, V); 4524 addMetadata(V, &I); 4525 } 4526 } 4527 4528 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, 4529 VPUser &Operands, 4530 bool InvariantCond, 4531 VPTransformState &State) { 4532 setDebugLocFromInst(Builder, &I); 4533 4534 // The condition can be loop invariant but still defined inside the 4535 // loop. This means that we can't just use the original 'cond' value. 
4536 // We have to take the 'vectorized' value and pick the first lane. 4537 // Instcombine will make this a no-op. 4538 auto *InvarCond = 4539 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4540 4541 for (unsigned Part = 0; Part < UF; ++Part) { 4542 Value *Cond = 4543 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4544 Value *Op0 = State.get(Operands.getOperand(1), Part); 4545 Value *Op1 = State.get(Operands.getOperand(2), Part); 4546 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4547 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4548 addMetadata(Sel, &I); 4549 } 4550 } 4551 4552 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4553 // We should not collect Scalars more than once per VF. Right now, this 4554 // function is called from collectUniformsAndScalars(), which already does 4555 // this check. Collecting Scalars for VF=1 does not make any sense. 4556 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && 4557 "This function should not be visited twice for the same VF"); 4558 4559 SmallSetVector<Instruction *, 8> Worklist; 4560 4561 // These sets are used to seed the analysis with pointers used by memory 4562 // accesses that will remain scalar. 4563 SmallSetVector<Instruction *, 8> ScalarPtrs; 4564 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4565 auto *Latch = TheLoop->getLoopLatch(); 4566 4567 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4568 // The pointer operands of loads and stores will be scalar as long as the 4569 // memory access is not a gather or scatter operation. The value operand of a 4570 // store will remain scalar if the store is scalarized. 4571 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4572 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4573 assert(WideningDecision != CM_Unknown && 4574 "Widening decision should be ready at this moment"); 4575 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4576 if (Ptr == Store->getValueOperand()) 4577 return WideningDecision == CM_Scalarize; 4578 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4579 "Ptr is neither a value or pointer operand"); 4580 return WideningDecision != CM_GatherScatter; 4581 }; 4582 4583 // A helper that returns true if the given value is a bitcast or 4584 // getelementptr instruction contained in the loop. 4585 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4586 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4587 isa<GetElementPtrInst>(V)) && 4588 !TheLoop->isLoopInvariant(V); 4589 }; 4590 4591 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 4592 if (!isa<PHINode>(Ptr) || 4593 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 4594 return false; 4595 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 4596 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 4597 return false; 4598 return isScalarUse(MemAccess, Ptr); 4599 }; 4600 4601 // A helper that evaluates a memory access's use of a pointer. If the 4602 // pointer is actually the pointer induction of a loop, it is being 4603 // inserted into Worklist. If the use will be a scalar use, and the 4604 // pointer is only used by memory accesses, we place the pointer in 4605 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 
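  // For instance (illustrative only): a getelementptr whose only users are a
  // scalarized load and a scalarized store is placed in ScalarPtrs, whereas
  // the same getelementptr additionally feeding a non-memory user (say, a
  // ptrtoint) ends up in PossibleNonScalarPtrs and is not seeded as scalar.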
4606 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4607 if (isScalarPtrInduction(MemAccess, Ptr)) { 4608 Worklist.insert(cast<Instruction>(Ptr)); 4609 Instruction *Update = cast<Instruction>( 4610 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 4611 Worklist.insert(Update); 4612 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 4613 << "\n"); 4614 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 4615 << "\n"); 4616 return; 4617 } 4618 // We only care about bitcast and getelementptr instructions contained in 4619 // the loop. 4620 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4621 return; 4622 4623 // If the pointer has already been identified as scalar (e.g., if it was 4624 // also identified as uniform), there's nothing to do. 4625 auto *I = cast<Instruction>(Ptr); 4626 if (Worklist.count(I)) 4627 return; 4628 4629 // If the use of the pointer will be a scalar use, and all users of the 4630 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4631 // place the pointer in PossibleNonScalarPtrs. 4632 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4633 return isa<LoadInst>(U) || isa<StoreInst>(U); 4634 })) 4635 ScalarPtrs.insert(I); 4636 else 4637 PossibleNonScalarPtrs.insert(I); 4638 }; 4639 4640 // We seed the scalars analysis with three classes of instructions: (1) 4641 // instructions marked uniform-after-vectorization and (2) bitcast, 4642 // getelementptr and (pointer) phi instructions used by memory accesses 4643 // requiring a scalar use. 4644 // 4645 // (1) Add to the worklist all instructions that have been identified as 4646 // uniform-after-vectorization. 4647 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4648 4649 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4650 // memory accesses requiring a scalar use. The pointer operands of loads and 4651 // stores will be scalar as long as the memory accesses is not a gather or 4652 // scatter operation. The value operand of a store will remain scalar if the 4653 // store is scalarized. 4654 for (auto *BB : TheLoop->blocks()) 4655 for (auto &I : *BB) { 4656 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4657 evaluatePtrUse(Load, Load->getPointerOperand()); 4658 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4659 evaluatePtrUse(Store, Store->getPointerOperand()); 4660 evaluatePtrUse(Store, Store->getValueOperand()); 4661 } 4662 } 4663 for (auto *I : ScalarPtrs) 4664 if (!PossibleNonScalarPtrs.count(I)) { 4665 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4666 Worklist.insert(I); 4667 } 4668 4669 // Insert the forced scalars. 4670 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4671 // induction variable when the PHI user is scalarized. 4672 auto ForcedScalar = ForcedScalars.find(VF); 4673 if (ForcedScalar != ForcedScalars.end()) 4674 for (auto *I : ForcedScalar->second) 4675 Worklist.insert(I); 4676 4677 // Expand the worklist by looking through any bitcasts and getelementptr 4678 // instructions we've already identified as scalar. This is similar to the 4679 // expansion step in collectLoopUniforms(); however, here we're only 4680 // expanding to include additional bitcasts and getelementptr instructions. 
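  // For example (illustrative IR): given
  //   %gep  = getelementptr inbounds i32, i32* %base, i64 %iv
  //   %cast = bitcast i32* %gep to float*
  //   %val  = load float, float* %cast
  // once %cast has been seeded as the scalar pointer of the scalarized load,
  // %gep is added as well, because its single user (%cast) is already in the
  // worklist.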
4681 unsigned Idx = 0; 4682 while (Idx != Worklist.size()) { 4683 Instruction *Dst = Worklist[Idx++]; 4684 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4685 continue; 4686 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4687 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4688 auto *J = cast<Instruction>(U); 4689 return !TheLoop->contains(J) || Worklist.count(J) || 4690 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4691 isScalarUse(J, Src)); 4692 })) { 4693 Worklist.insert(Src); 4694 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4695 } 4696 } 4697 4698 // An induction variable will remain scalar if all users of the induction 4699 // variable and induction variable update remain scalar. 4700 for (auto &Induction : Legal->getInductionVars()) { 4701 auto *Ind = Induction.first; 4702 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4703 4704 // If tail-folding is applied, the primary induction variable will be used 4705 // to feed a vector compare. 4706 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4707 continue; 4708 4709 // Determine if all users of the induction variable are scalar after 4710 // vectorization. 4711 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4712 auto *I = cast<Instruction>(U); 4713 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4714 }); 4715 if (!ScalarInd) 4716 continue; 4717 4718 // Determine if all users of the induction variable update instruction are 4719 // scalar after vectorization. 4720 auto ScalarIndUpdate = 4721 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4722 auto *I = cast<Instruction>(U); 4723 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4724 }); 4725 if (!ScalarIndUpdate) 4726 continue; 4727 4728 // The induction variable and its update instruction will remain scalar. 4729 Worklist.insert(Ind); 4730 Worklist.insert(IndUpdate); 4731 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4732 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4733 << "\n"); 4734 } 4735 4736 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4737 } 4738 4739 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { 4740 if (!blockNeedsPredication(I->getParent())) 4741 return false; 4742 switch(I->getOpcode()) { 4743 default: 4744 break; 4745 case Instruction::Load: 4746 case Instruction::Store: { 4747 if (!Legal->isMaskRequired(I)) 4748 return false; 4749 auto *Ptr = getLoadStorePointerOperand(I); 4750 auto *Ty = getMemInstValueType(I); 4751 // We have already decided how to vectorize this instruction, get that 4752 // result. 4753 if (VF > 1) { 4754 InstWidening WideningDecision = getWideningDecision(I, VF); 4755 assert(WideningDecision != CM_Unknown && 4756 "Widening decision should be ready at this moment"); 4757 return WideningDecision == CM_Scalarize; 4758 } 4759 const Align Alignment = getLoadStoreAlignment(I); 4760 return isa<LoadInst>(I) ? 
!(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4761 isLegalMaskedGather(Ty, Alignment)) 4762 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4763 isLegalMaskedScatter(Ty, Alignment)); 4764 } 4765 case Instruction::UDiv: 4766 case Instruction::SDiv: 4767 case Instruction::SRem: 4768 case Instruction::URem: 4769 return mayDivideByZero(*I); 4770 } 4771 return false; 4772 } 4773 4774 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, 4775 unsigned VF) { 4776 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4777 assert(getWideningDecision(I, VF) == CM_Unknown && 4778 "Decision should not be set yet."); 4779 auto *Group = getInterleavedAccessGroup(I); 4780 assert(Group && "Must have a group."); 4781 4782 // If the instruction's allocated size doesn't equal it's type size, it 4783 // requires padding and will be scalarized. 4784 auto &DL = I->getModule()->getDataLayout(); 4785 auto *ScalarTy = getMemInstValueType(I); 4786 if (hasIrregularType(ScalarTy, DL, VF)) 4787 return false; 4788 4789 // Check if masking is required. 4790 // A Group may need masking for one of two reasons: it resides in a block that 4791 // needs predication, or it was decided to use masking to deal with gaps. 4792 bool PredicatedAccessRequiresMasking = 4793 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 4794 bool AccessWithGapsRequiresMasking = 4795 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 4796 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 4797 return true; 4798 4799 // If masked interleaving is required, we expect that the user/target had 4800 // enabled it, because otherwise it either wouldn't have been created or 4801 // it should have been invalidated by the CostModel. 4802 assert(useMaskedInterleavedAccesses(TTI) && 4803 "Masked interleave-groups for predicated accesses are not enabled."); 4804 4805 auto *Ty = getMemInstValueType(I); 4806 const Align Alignment = getLoadStoreAlignment(I); 4807 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4808 : TTI.isLegalMaskedStore(Ty, Alignment); 4809 } 4810 4811 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, 4812 unsigned VF) { 4813 // Get and ensure we have a valid memory instruction. 4814 LoadInst *LI = dyn_cast<LoadInst>(I); 4815 StoreInst *SI = dyn_cast<StoreInst>(I); 4816 assert((LI || SI) && "Invalid memory instruction"); 4817 4818 auto *Ptr = getLoadStorePointerOperand(I); 4819 4820 // In order to be widened, the pointer should be consecutive, first of all. 4821 if (!Legal->isConsecutivePtr(Ptr)) 4822 return false; 4823 4824 // If the instruction is a store located in a predicated block, it will be 4825 // scalarized. 4826 if (isScalarWithPredication(I)) 4827 return false; 4828 4829 // If the instruction's allocated size doesn't equal it's type size, it 4830 // requires padding and will be scalarized. 4831 auto &DL = I->getModule()->getDataLayout(); 4832 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 4833 if (hasIrregularType(ScalarTy, DL, VF)) 4834 return false; 4835 4836 return true; 4837 } 4838 4839 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { 4840 // We should not collect Uniforms more than once per VF. Right now, 4841 // this function is called from collectUniformsAndScalars(), which 4842 // already does this check. Collecting Uniforms for VF=1 does not make any 4843 // sense. 
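// As an illustrative example of what typically remains uniform: the address
// computation feeding a consecutive load, e.g.
//   %gep = getelementptr i32, i32* %a, i64 %index
// is needed only for lane 0 of each vector iteration, so it stays uniform,
// while the loaded values themselves are widened.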
4844 4845 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && 4846 "This function should not be visited twice for the same VF"); 4847 4848 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4849 // not analyze again. Uniforms.count(VF) will return 1. 4850 Uniforms[VF].clear(); 4851 4852 // We now know that the loop is vectorizable! 4853 // Collect instructions inside the loop that will remain uniform after 4854 // vectorization. 4855 4856 // Global values, params and instructions outside of current loop are out of 4857 // scope. 4858 auto isOutOfScope = [&](Value *V) -> bool { 4859 Instruction *I = dyn_cast<Instruction>(V); 4860 return (!I || !TheLoop->contains(I)); 4861 }; 4862 4863 SetVector<Instruction *> Worklist; 4864 BasicBlock *Latch = TheLoop->getLoopLatch(); 4865 4866 // Instructions that are scalar with predication must not be considered 4867 // uniform after vectorization, because that would create an erroneous 4868 // replicating region where only a single instance out of VF should be formed. 4869 // TODO: optimize such seldom cases if found important, see PR40816. 4870 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4871 if (isScalarWithPredication(I, VF)) { 4872 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4873 << *I << "\n"); 4874 return; 4875 } 4876 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4877 Worklist.insert(I); 4878 }; 4879 4880 // Start with the conditional branch. If the branch condition is an 4881 // instruction contained in the loop that is only used by the branch, it is 4882 // uniform. 4883 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4884 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4885 addToWorklistIfAllowed(Cmp); 4886 4887 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 4888 // are pointers that are treated like consecutive pointers during 4889 // vectorization. The pointer operands of interleaved accesses are an 4890 // example. 4891 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 4892 4893 // Holds pointer operands of instructions that are possibly non-uniform. 4894 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 4895 4896 auto isUniformDecision = [&](Instruction *I, unsigned VF) { 4897 InstWidening WideningDecision = getWideningDecision(I, VF); 4898 assert(WideningDecision != CM_Unknown && 4899 "Widening decision should be ready at this moment"); 4900 4901 return (WideningDecision == CM_Widen || 4902 WideningDecision == CM_Widen_Reverse || 4903 WideningDecision == CM_Interleave); 4904 }; 4905 // Iterate over the instructions in the loop, and collect all 4906 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 4907 // that a consecutive-like pointer operand will be scalarized, we collect it 4908 // in PossibleNonUniformPtrs instead. We use two sets here because a single 4909 // getelementptr instruction can be used by both vectorized and scalarized 4910 // memory instructions. For example, if a loop loads and stores from the same 4911 // location, but the store is conditional, the store will be scalarized, and 4912 // the getelementptr won't remain uniform. 4913 for (auto *BB : TheLoop->blocks()) 4914 for (auto &I : *BB) { 4915 // If there's no pointer operand, there's nothing to do. 
4916 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 4917 if (!Ptr) 4918 continue; 4919 4920 // True if all users of Ptr are memory accesses that have Ptr as their 4921 // pointer operand. 4922 auto UsersAreMemAccesses = 4923 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 4924 return getLoadStorePointerOperand(U) == Ptr; 4925 }); 4926 4927 // Ensure the memory instruction will not be scalarized or used by 4928 // gather/scatter, making its pointer operand non-uniform. If the pointer 4929 // operand is used by any instruction other than a memory access, we 4930 // conservatively assume the pointer operand may be non-uniform. 4931 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 4932 PossibleNonUniformPtrs.insert(Ptr); 4933 4934 // If the memory instruction will be vectorized and its pointer operand 4935 // is consecutive-like, or interleaving - the pointer operand should 4936 // remain uniform. 4937 else 4938 ConsecutiveLikePtrs.insert(Ptr); 4939 } 4940 4941 // Add to the Worklist all consecutive and consecutive-like pointers that 4942 // aren't also identified as possibly non-uniform. 4943 for (auto *V : ConsecutiveLikePtrs) 4944 if (!PossibleNonUniformPtrs.count(V)) 4945 addToWorklistIfAllowed(V); 4946 4947 // Expand Worklist in topological order: whenever a new instruction 4948 // is added , its users should be already inside Worklist. It ensures 4949 // a uniform instruction will only be used by uniform instructions. 4950 unsigned idx = 0; 4951 while (idx != Worklist.size()) { 4952 Instruction *I = Worklist[idx++]; 4953 4954 for (auto OV : I->operand_values()) { 4955 // isOutOfScope operands cannot be uniform instructions. 4956 if (isOutOfScope(OV)) 4957 continue; 4958 // First order recurrence Phi's should typically be considered 4959 // non-uniform. 4960 auto *OP = dyn_cast<PHINode>(OV); 4961 if (OP && Legal->isFirstOrderRecurrence(OP)) 4962 continue; 4963 // If all the users of the operand are uniform, then add the 4964 // operand into the uniform worklist. 4965 auto *OI = cast<Instruction>(OV); 4966 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4967 auto *J = cast<Instruction>(U); 4968 return Worklist.count(J) || 4969 (OI == getLoadStorePointerOperand(J) && 4970 isUniformDecision(J, VF)); 4971 })) 4972 addToWorklistIfAllowed(OI); 4973 } 4974 } 4975 4976 // Returns true if Ptr is the pointer operand of a memory access instruction 4977 // I, and I is known to not require scalarization. 4978 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4979 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4980 }; 4981 4982 // For an instruction to be added into Worklist above, all its users inside 4983 // the loop should also be in Worklist. However, this condition cannot be 4984 // true for phi nodes that form a cyclic dependence. We must process phi 4985 // nodes separately. An induction variable will remain uniform if all users 4986 // of the induction variable and induction variable update remain uniform. 4987 // The code below handles both pointer and non-pointer induction variables. 4988 for (auto &Induction : Legal->getInductionVars()) { 4989 auto *Ind = Induction.first; 4990 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4991 4992 // Determine if all users of the induction variable are uniform after 4993 // vectorization. 
4994 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4995 auto *I = cast<Instruction>(U); 4996 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4997 isVectorizedMemAccessUse(I, Ind); 4998 }); 4999 if (!UniformInd) 5000 continue; 5001 5002 // Determine if all users of the induction variable update instruction are 5003 // uniform after vectorization. 5004 auto UniformIndUpdate = 5005 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5006 auto *I = cast<Instruction>(U); 5007 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5008 isVectorizedMemAccessUse(I, IndUpdate); 5009 }); 5010 if (!UniformIndUpdate) 5011 continue; 5012 5013 // The induction variable and its update instruction will remain uniform. 5014 addToWorklistIfAllowed(Ind); 5015 addToWorklistIfAllowed(IndUpdate); 5016 } 5017 5018 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5019 } 5020 5021 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5022 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5023 5024 if (Legal->getRuntimePointerChecking()->Need) { 5025 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5026 "runtime pointer checks needed. Enable vectorization of this " 5027 "loop with '#pragma clang loop vectorize(enable)' when " 5028 "compiling with -Os/-Oz", 5029 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5030 return true; 5031 } 5032 5033 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5034 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5035 "runtime SCEV checks needed. Enable vectorization of this " 5036 "loop with '#pragma clang loop vectorize(enable)' when " 5037 "compiling with -Os/-Oz", 5038 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5039 return true; 5040 } 5041 5042 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5043 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5044 reportVectorizationFailure("Runtime stride check for small trip count", 5045 "runtime stride == 1 checks needed. Enable vectorization of " 5046 "this loop without such check by compiling with -Os/-Oz", 5047 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5048 return true; 5049 } 5050 5051 return false; 5052 } 5053 5054 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, 5055 unsigned UserIC) { 5056 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5057 // TODO: It may by useful to do since it's still likely to be dynamically 5058 // uniform if the target can skip. 5059 reportVectorizationFailure( 5060 "Not inserting runtime ptr check for divergent target", 5061 "runtime pointer checks needed. Not enabled for divergent target", 5062 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5063 return None; 5064 } 5065 5066 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5067 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5068 if (TC == 1) { 5069 reportVectorizationFailure("Single iteration (non) loop", 5070 "loop trip count is one, irrelevant for vectorization", 5071 "SingleIterationLoop", ORE, TheLoop); 5072 return None; 5073 } 5074 5075 switch (ScalarEpilogueStatus) { 5076 case CM_ScalarEpilogueAllowed: 5077 return UserVF ? 
UserVF : computeFeasibleMaxVF(TC); 5078 case CM_ScalarEpilogueNotNeededUsePredicate: 5079 LLVM_DEBUG( 5080 dbgs() << "LV: vector predicate hint/switch found.\n" 5081 << "LV: Not allowing scalar epilogue, creating predicated " 5082 << "vector loop.\n"); 5083 break; 5084 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5085 // fallthrough as a special case of OptForSize 5086 case CM_ScalarEpilogueNotAllowedOptSize: 5087 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5088 LLVM_DEBUG( 5089 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5090 else 5091 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5092 << "count.\n"); 5093 5094 // Bail if runtime checks are required, which are not good when optimizing 5095 // for size. 5096 if (runtimeChecksRequired()) 5097 return None; 5098 break; 5099 } 5100 5101 // Now try the tail folding. 5102 5103 // Invalidate interleave groups that require an epilogue if we can't mask 5104 // the interleave-group. 5105 if (!useMaskedInterleavedAccesses(TTI)) { 5106 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5107 "No decisions should have been taken at this point"); 5108 // Note: There is no need to invalidate any cost modeling decisions here, as 5109 // none were taken so far. 5110 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5111 } 5112 5113 unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC); 5114 assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2"); 5115 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF; 5116 if (TC > 0 && TC % MaxVFtimesIC == 0) { 5117 // Accept MaxVF if we do not have a tail. 5118 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5119 return MaxVF; 5120 } 5121 5122 // If we don't know the precise trip count, or if the trip count that we 5123 // found modulo the vectorization factor is not zero, try to fold the tail 5124 // by masking. 5125 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5126 if (Legal->prepareToFoldTailByMasking()) { 5127 FoldTailByMasking = true; 5128 return MaxVF; 5129 } 5130 5131 if (TC == 0) { 5132 reportVectorizationFailure( 5133 "Unable to calculate the loop count due to complex control flow", 5134 "unable to calculate the loop count due to complex control flow", 5135 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5136 return None; 5137 } 5138 5139 reportVectorizationFailure( 5140 "Cannot optimize for size and vectorize at the same time.", 5141 "cannot optimize for size and vectorize at the same time. " 5142 "Enable vectorization of this loop with '#pragma clang loop " 5143 "vectorize(enable)' when compiling with -Os/-Oz", 5144 "NoTailLoopWithOptForSize", ORE, TheLoop); 5145 return None; 5146 } 5147 5148 unsigned 5149 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 5150 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5151 unsigned SmallestType, WidestType; 5152 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5153 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5154 5155 // Get the maximum safe dependence distance in bits computed by LAA. 5156 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5157 // the memory access that is most restrictive (involved in the smallest 5158 // dependence distance).
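  // For example (numbers purely illustrative): with 256-bit target registers,
  // a maximum safe dependence distance of 128 bits and a widest element type
  // of 32 bits, WidestRegister is clamped to 128 below and MaxVectorSize
  // becomes PowerOf2Floor(128 / 32) = 4.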
5159 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 5160 5161 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 5162 5163 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5164 // Note that both WidestRegister and WidestType may not be powers of 2. 5165 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); 5166 5167 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5168 << " / " << WidestType << " bits.\n"); 5169 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5170 << WidestRegister << " bits.\n"); 5171 5172 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 5173 " into one vector!"); 5174 if (MaxVectorSize == 0) { 5175 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5176 MaxVectorSize = 1; 5177 return MaxVectorSize; 5178 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5179 isPowerOf2_32(ConstTripCount)) { 5180 // We need to clamp the VF to ConstTripCount. There is no point in 5181 // choosing a higher viable VF as done in the loop below. 5182 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5183 << ConstTripCount << "\n"); 5184 MaxVectorSize = ConstTripCount; 5185 return MaxVectorSize; 5186 } 5187 5188 unsigned MaxVF = MaxVectorSize; 5189 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5190 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5191 // Collect all viable vectorization factors larger than the default MaxVF 5192 // (i.e. MaxVectorSize). 5193 SmallVector<unsigned, 8> VFs; 5194 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5195 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5196 VFs.push_back(VS); 5197 5198 // For each VF calculate its register usage. 5199 auto RUs = calculateRegisterUsage(VFs); 5200 5201 // Select the largest VF which doesn't require more registers than existing 5202 // ones. 5203 for (int i = RUs.size() - 1; i >= 0; --i) { 5204 bool Selected = true; 5205 for (auto& pair : RUs[i].MaxLocalUsers) { 5206 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5207 if (pair.second > TargetNumRegisters) 5208 Selected = false; 5209 } 5210 if (Selected) { 5211 MaxVF = VFs[i]; 5212 break; 5213 } 5214 } 5215 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5216 if (MaxVF < MinVF) { 5217 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5218 << ") with target's minimum: " << MinVF << '\n'); 5219 MaxVF = MinVF; 5220 } 5221 } 5222 } 5223 return MaxVF; 5224 } 5225 5226 VectorizationFactor 5227 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5228 float Cost = expectedCost(1).first; 5229 const float ScalarCost = Cost; 5230 unsigned Width = 1; 5231 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5232 5233 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5234 if (ForceVectorization && MaxVF > 1) { 5235 // Ignore scalar width, because the user explicitly wants vectorization. 5236 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5237 // evaluation. 5238 Cost = std::numeric_limits<float>::max(); 5239 } 5240 5241 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5242 // Notice that the vector loop needs to be executed fewer times, so 5243 // we need to divide the cost of the vector loop by the width of 5244 // the vector elements.
5245 VectorizationCostTy C = expectedCost(i); 5246 float VectorCost = C.first / (float)i; 5247 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5248 << " costs: " << (int)VectorCost << ".\n"); 5249 if (!C.second && !ForceVectorization) { 5250 LLVM_DEBUG( 5251 dbgs() << "LV: Not considering vector loop of width " << i 5252 << " because it will not generate any vector instructions.\n"); 5253 continue; 5254 } 5255 if (VectorCost < Cost) { 5256 Cost = VectorCost; 5257 Width = i; 5258 } 5259 } 5260 5261 if (!EnableCondStoresVectorization && NumPredStores) { 5262 reportVectorizationFailure("There are conditional stores.", 5263 "store that is conditionally executed prevents vectorization", 5264 "ConditionalStore", ORE, TheLoop); 5265 Width = 1; 5266 Cost = ScalarCost; 5267 } 5268 5269 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5270 << "LV: Vectorization seems to be not beneficial, " 5271 << "but was forced by a user.\n"); 5272 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5273 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; 5274 return Factor; 5275 } 5276 5277 std::pair<unsigned, unsigned> 5278 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5279 unsigned MinWidth = -1U; 5280 unsigned MaxWidth = 8; 5281 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5282 5283 // For each block. 5284 for (BasicBlock *BB : TheLoop->blocks()) { 5285 // For each instruction in the loop. 5286 for (Instruction &I : BB->instructionsWithoutDebug()) { 5287 Type *T = I.getType(); 5288 5289 // Skip ignored values. 5290 if (ValuesToIgnore.count(&I)) 5291 continue; 5292 5293 // Only examine Loads, Stores and PHINodes. 5294 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5295 continue; 5296 5297 // Examine PHI nodes that are reduction variables. Update the type to 5298 // account for the recurrence type. 5299 if (auto *PN = dyn_cast<PHINode>(&I)) { 5300 if (!Legal->isReductionVariable(PN)) 5301 continue; 5302 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5303 T = RdxDesc.getRecurrenceType(); 5304 } 5305 5306 // Examine the stored values. 5307 if (auto *ST = dyn_cast<StoreInst>(&I)) 5308 T = ST->getValueOperand()->getType(); 5309 5310 // Ignore loaded pointer types and stored pointer types that are not 5311 // vectorizable. 5312 // 5313 // FIXME: The check here attempts to predict whether a load or store will 5314 // be vectorized. We only know this for certain after a VF has 5315 // been selected. Here, we assume that if an access can be 5316 // vectorized, it will be. We should also look at extending this 5317 // optimization to non-pointer types. 5318 // 5319 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5320 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5321 continue; 5322 5323 MinWidth = std::min(MinWidth, 5324 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5325 MaxWidth = std::max(MaxWidth, 5326 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5327 } 5328 } 5329 5330 return {MinWidth, MaxWidth}; 5331 } 5332 5333 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, 5334 unsigned LoopCost) { 5335 // -- The interleave heuristics -- 5336 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5337 // There are many micro-architectural considerations that we can't predict 5338 // at this level. 
For example, frontend pressure (on decode or fetch) due to 5339 // code size, or the number and capabilities of the execution ports. 5340 // 5341 // We use the following heuristics to select the interleave count: 5342 // 1. If the code has reductions, then we interleave to break the cross 5343 // iteration dependency. 5344 // 2. If the loop is really small, then we interleave to reduce the loop 5345 // overhead. 5346 // 3. We don't interleave if we think that we will spill registers to memory 5347 // due to the increased register pressure. 5348 5349 if (!isScalarEpilogueAllowed()) 5350 return 1; 5351 5352 // We used the distance for the interleave count. 5353 if (Legal->getMaxSafeDepDistBytes() != -1U) 5354 return 1; 5355 5356 // Do not interleave loops with a relatively small known or estimated trip 5357 // count. 5358 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5359 if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) 5360 return 1; 5361 5362 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5363 // We divide by these constants so assume that we have at least one 5364 // instruction that uses at least one register. 5365 for (auto& pair : R.MaxLocalUsers) { 5366 pair.second = std::max(pair.second, 1U); 5367 } 5368 5369 // We calculate the interleave count using the following formula. 5370 // Subtract the number of loop invariants from the number of available 5371 // registers. These registers are used by all of the interleaved instances. 5372 // Next, divide the remaining registers by the number of registers that is 5373 // required by the loop, in order to estimate how many parallel instances 5374 // fit without causing spills. All of this is rounded down if necessary to be 5375 // a power of two. We want power of two interleave count to simplify any 5376 // addressing operations or alignment considerations. 5377 // We also want power of two interleave counts to ensure that the induction 5378 // variable of the vector loop wraps to zero, when tail is folded by masking; 5379 // this currently happens when OptForSize, in which case IC is set to 1 above. 5380 unsigned IC = UINT_MAX; 5381 5382 for (auto& pair : R.MaxLocalUsers) { 5383 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5384 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5385 << " registers of " 5386 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5387 if (VF == 1) { 5388 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5389 TargetNumRegisters = ForceTargetNumScalarRegs; 5390 } else { 5391 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5392 TargetNumRegisters = ForceTargetNumVectorRegs; 5393 } 5394 unsigned MaxLocalUsers = pair.second; 5395 unsigned LoopInvariantRegs = 0; 5396 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5397 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5398 5399 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5400 // Don't count the induction variable as interleaved. 5401 if (EnableIndVarRegisterHeur) { 5402 TmpIC = 5403 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5404 std::max(1U, (MaxLocalUsers - 1))); 5405 } 5406 5407 IC = std::min(IC, TmpIC); 5408 } 5409 5410 // Clamp the interleave ranges to reasonable counts. 5411 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5412 5413 // Check if the user has overridden the max. 
5414 if (VF == 1) { 5415 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5416 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5417 } else { 5418 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5419 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5420 } 5421 5422 // If trip count is known or estimated compile time constant, limit the 5423 // interleave count to be less than the trip count divided by VF. 5424 if (BestKnownTC) { 5425 MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); 5426 } 5427 5428 // If we did not calculate the cost for VF (because the user selected the VF) 5429 // then we calculate the cost of VF here. 5430 if (LoopCost == 0) 5431 LoopCost = expectedCost(VF).first; 5432 5433 assert(LoopCost && "Non-zero loop cost expected"); 5434 5435 // Clamp the calculated IC to be between the 1 and the max interleave count 5436 // that the target and trip count allows. 5437 if (IC > MaxInterleaveCount) 5438 IC = MaxInterleaveCount; 5439 else if (IC < 1) 5440 IC = 1; 5441 5442 // Interleave if we vectorized this loop and there is a reduction that could 5443 // benefit from interleaving. 5444 if (VF > 1 && !Legal->getReductionVars().empty()) { 5445 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5446 return IC; 5447 } 5448 5449 // Note that if we've already vectorized the loop we will have done the 5450 // runtime check and so interleaving won't require further checks. 5451 bool InterleavingRequiresRuntimePointerCheck = 5452 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5453 5454 // We want to interleave small loops in order to reduce the loop overhead and 5455 // potentially expose ILP opportunities. 5456 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5457 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5458 // We assume that the cost overhead is 1 and we use the cost model 5459 // to estimate the cost of the loop and interleave until the cost of the 5460 // loop overhead is about 5% of the cost of the loop. 5461 unsigned SmallIC = 5462 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5463 5464 // Interleave until store/load ports (estimated by max interleave count) are 5465 // saturated. 5466 unsigned NumStores = Legal->getNumStores(); 5467 unsigned NumLoads = Legal->getNumLoads(); 5468 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5469 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5470 5471 // If we have a scalar reduction (vector reductions are already dealt with 5472 // by this point), we can increase the critical path length if the loop 5473 // we're interleaving is inside another loop. Limit, by default to 2, so the 5474 // critical path only gets increased by one reduction operation. 
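    // Purely illustrative numbers: with IC = 8, LoopCost = 4 and
    // SmallLoopCost = 20, SmallIC = min(8, PowerOf2Floor(20 / 4)) = 4; with
    // two stores and one load, StoresIC = 8 / 2 = 4 and LoadsIC = 8 / 1 = 8.
    // The clamp below would further limit all three to
    // MaxNestedScalarReductionIC (2 by default) when a scalar reduction is
    // being interleaved inside an outer loop.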
5475 if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) { 5476 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5477 SmallIC = std::min(SmallIC, F); 5478 StoresIC = std::min(StoresIC, F); 5479 LoadsIC = std::min(LoadsIC, F); 5480 } 5481 5482 if (EnableLoadStoreRuntimeInterleave && 5483 std::max(StoresIC, LoadsIC) > SmallIC) { 5484 LLVM_DEBUG( 5485 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5486 return std::max(StoresIC, LoadsIC); 5487 } 5488 5489 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5490 return SmallIC; 5491 } 5492 5493 // Interleave if this is a large loop (small loops are already dealt with by 5494 // this point) that could benefit from interleaving. 5495 bool HasReductions = !Legal->getReductionVars().empty(); 5496 if (TTI.enableAggressiveInterleaving(HasReductions)) { 5497 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5498 return IC; 5499 } 5500 5501 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5502 return 1; 5503 } 5504 5505 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5506 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { 5507 // This function calculates the register usage by measuring the highest number 5508 // of values that are alive at a single location. Obviously, this is a very 5509 // rough estimation. We scan the loop in a topological order in order and 5510 // assign a number to each instruction. We use RPO to ensure that defs are 5511 // met before their users. We assume that each instruction that has in-loop 5512 // users starts an interval. We record every time that an in-loop value is 5513 // used, so we have a list of the first and last occurrences of each 5514 // instruction. Next, we transpose this data structure into a multi map that 5515 // holds the list of intervals that *end* at a specific location. This multi 5516 // map allows us to perform a linear search. We scan the instructions linearly 5517 // and record each time that a new interval starts, by placing it in a set. 5518 // If we find this value in the multi-map then we remove it from the set. 5519 // The max register usage is the maximum size of the set. 5520 // We also search for instructions that are defined outside the loop, but are 5521 // used inside the loop. We need this number separately from the max-interval 5522 // usage number because when we unroll, loop-invariant values do not take 5523 // more register. 5524 LoopBlocksDFS DFS(TheLoop); 5525 DFS.perform(LI); 5526 5527 RegisterUsage RU; 5528 5529 // Each 'key' in the map opens a new interval. The values 5530 // of the map are the index of the 'last seen' usage of the 5531 // instruction that is the key. 5532 using IntervalMap = DenseMap<Instruction *, unsigned>; 5533 5534 // Maps instruction to its index. 5535 SmallVector<Instruction *, 64> IdxToInstr; 5536 // Marks the end of each interval. 5537 IntervalMap EndPoint; 5538 // Saves the list of instruction indices that are used in the loop. 5539 SmallPtrSet<Instruction *, 8> Ends; 5540 // Saves the list of values that are used in the loop but are 5541 // defined outside the loop, such as arguments and constants. 5542 SmallPtrSet<Value *, 8> LoopInvariants; 5543 5544 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5545 for (Instruction &I : BB->instructionsWithoutDebug()) { 5546 IdxToInstr.push_back(&I); 5547 5548 // Save the end location of each USE. 
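      // Illustrative sketch (hypothetical IR, not taken from any test): for
      //   %a = load ...          ; idx 0
      //   %b = load ...          ; idx 1
      //   %c = add i32 %a, %b    ; idx 2
      //   store i32 %c, ...      ; idx 3
      // the operand scan below records the last use of each in-loop value;
      // %a and %b stay open across the add, so the interval walk later in
      // this function reports a maximum usage of 2 for their register class
      // at VF = 1 (wider VFs scale each open value via GetRegUsage).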
5549 for (Value *U : I.operands()) { 5550 auto *Instr = dyn_cast<Instruction>(U); 5551 5552 // Ignore non-instruction values such as arguments, constants, etc. 5553 if (!Instr) 5554 continue; 5555 5556 // If this instruction is outside the loop then record it and continue. 5557 if (!TheLoop->contains(Instr)) { 5558 LoopInvariants.insert(Instr); 5559 continue; 5560 } 5561 5562 // Overwrite previous end points. 5563 EndPoint[Instr] = IdxToInstr.size(); 5564 Ends.insert(Instr); 5565 } 5566 } 5567 } 5568 5569 // Saves the list of intervals that end with the index in 'key'. 5570 using InstrList = SmallVector<Instruction *, 2>; 5571 DenseMap<unsigned, InstrList> TransposeEnds; 5572 5573 // Transpose the EndPoints to a list of values that end at each index. 5574 for (auto &Interval : EndPoint) 5575 TransposeEnds[Interval.second].push_back(Interval.first); 5576 5577 SmallPtrSet<Instruction *, 8> OpenIntervals; 5578 5579 // Get the size of the widest register. 5580 unsigned MaxSafeDepDist = -1U; 5581 if (Legal->getMaxSafeDepDistBytes() != -1U) 5582 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5583 unsigned WidestRegister = 5584 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5585 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5586 5587 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5588 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5589 5590 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5591 5592 // A lambda that gets the register usage for the given type and VF. 5593 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5594 if (Ty->isTokenTy()) 5595 return 0U; 5596 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5597 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5598 }; 5599 5600 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5601 Instruction *I = IdxToInstr[i]; 5602 5603 // Remove all of the instructions that end at this location. 5604 InstrList &List = TransposeEnds[i]; 5605 for (Instruction *ToRemove : List) 5606 OpenIntervals.erase(ToRemove); 5607 5608 // Ignore instructions that are never used within the loop. 5609 if (!Ends.count(I)) 5610 continue; 5611 5612 // Skip ignored values. 5613 if (ValuesToIgnore.count(I)) 5614 continue; 5615 5616 // For each VF find the maximum usage of registers. 5617 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5618 // Count the number of live intervals. 5619 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5620 5621 if (VFs[j] == 1) { 5622 for (auto Inst : OpenIntervals) { 5623 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5624 if (RegUsage.find(ClassID) == RegUsage.end()) 5625 RegUsage[ClassID] = 1; 5626 else 5627 RegUsage[ClassID] += 1; 5628 } 5629 } else { 5630 collectUniformsAndScalars(VFs[j]); 5631 for (auto Inst : OpenIntervals) { 5632 // Skip ignored values for VF > 1. 
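          // After the skip just below, each remaining open value is charged
          // per register class. For illustration only (hypothetical type
          // sizes and register width): with 128-bit vector registers, an open
          // i32 at VF = 4 costs max(1, 4 * 32 / 128) = 1 vector register via
          // GetRegUsage above and 2 registers at VF = 8, whereas a value that
          // stays scalar after vectorization is charged one register of its
          // scalar class.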
5633 if (VecValuesToIgnore.count(Inst)) 5634 continue; 5635 if (isScalarAfterVectorization(Inst, VFs[j])) { 5636 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5637 if (RegUsage.find(ClassID) == RegUsage.end()) 5638 RegUsage[ClassID] = 1; 5639 else 5640 RegUsage[ClassID] += 1; 5641 } else { 5642 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5643 if (RegUsage.find(ClassID) == RegUsage.end()) 5644 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5645 else 5646 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5647 } 5648 } 5649 } 5650 5651 for (auto& pair : RegUsage) { 5652 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5653 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5654 else 5655 MaxUsages[j][pair.first] = pair.second; 5656 } 5657 } 5658 5659 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5660 << OpenIntervals.size() << '\n'); 5661 5662 // Add the current instruction to the list of open intervals. 5663 OpenIntervals.insert(I); 5664 } 5665 5666 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5667 SmallMapVector<unsigned, unsigned, 4> Invariant; 5668 5669 for (auto Inst : LoopInvariants) { 5670 unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5671 unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); 5672 if (Invariant.find(ClassID) == Invariant.end()) 5673 Invariant[ClassID] = Usage; 5674 else 5675 Invariant[ClassID] += Usage; 5676 } 5677 5678 LLVM_DEBUG({ 5679 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5680 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5681 << " item\n"; 5682 for (const auto &pair : MaxUsages[i]) { 5683 dbgs() << "LV(REG): RegisterClass: " 5684 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5685 << " registers\n"; 5686 } 5687 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5688 << " item\n"; 5689 for (const auto &pair : Invariant) { 5690 dbgs() << "LV(REG): RegisterClass: " 5691 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5692 << " registers\n"; 5693 } 5694 }); 5695 5696 RU.LoopInvariantRegs = Invariant; 5697 RU.MaxLocalUsers = MaxUsages[i]; 5698 RUs[i] = RU; 5699 } 5700 5701 return RUs; 5702 } 5703 5704 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5705 // TODO: Cost model for emulated masked load/store is completely 5706 // broken. This hack guides the cost model to use an artificially 5707 // high enough value to practically disable vectorization with such 5708 // operations, except where previously deployed legality hack allowed 5709 // using very low cost values. This is to avoid regressions coming simply 5710 // from moving "masked load/store" check from legality to cost model. 5711 // Masked Load/Gather emulation was previously never allowed. 5712 // Limited number of Masked Store/Scatter emulation was allowed. 5713 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 5714 return isa<LoadInst>(I) || 5715 (isa<StoreInst>(I) && 5716 NumPredStores > NumberOfStoresToPredicate); 5717 } 5718 5719 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { 5720 // If we aren't vectorizing the loop, or if we've already collected the 5721 // instructions to scalarize, there's nothing to do. Collection may already 5722 // have occurred if we have a user-selected VF and are now computing the 5723 // expected cost for interleaving. 
5724 if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end()) 5725 return; 5726 5727 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 5728 // not profitable to scalarize any instructions, the presence of VF in the 5729 // map will indicate that we've analyzed it already. 5730 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 5731 5732 // Find all the instructions that are scalar with predication in the loop and 5733 // determine if it would be better to not if-convert the blocks they are in. 5734 // If so, we also record the instructions to scalarize. 5735 for (BasicBlock *BB : TheLoop->blocks()) { 5736 if (!blockNeedsPredication(BB)) 5737 continue; 5738 for (Instruction &I : *BB) 5739 if (isScalarWithPredication(&I)) { 5740 ScalarCostsTy ScalarCosts; 5741 // Do not apply discount logic if hacked cost is needed 5742 // for emulated masked memrefs. 5743 if (!useEmulatedMaskMemRefHack(&I) && 5744 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 5745 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5746 // Remember that BB will remain after vectorization. 5747 PredicatedBBsAfterVectorization.insert(BB); 5748 } 5749 } 5750 } 5751 5752 int LoopVectorizationCostModel::computePredInstDiscount( 5753 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, 5754 unsigned VF) { 5755 assert(!isUniformAfterVectorization(PredInst, VF) && 5756 "Instruction marked uniform-after-vectorization will be predicated"); 5757 5758 // Initialize the discount to zero, meaning that the scalar version and the 5759 // vector version cost the same. 5760 int Discount = 0; 5761 5762 // Holds instructions to analyze. The instructions we visit are mapped in 5763 // ScalarCosts. Those instructions are the ones that would be scalarized if 5764 // we find that the scalar version costs less. 5765 SmallVector<Instruction *, 8> Worklist; 5766 5767 // Returns true if the given instruction can be scalarized. 5768 auto canBeScalarized = [&](Instruction *I) -> bool { 5769 // We only attempt to scalarize instructions forming a single-use chain 5770 // from the original predicated block that would otherwise be vectorized. 5771 // Although not strictly necessary, we give up on instructions we know will 5772 // already be scalar to avoid traversing chains that are unlikely to be 5773 // beneficial. 5774 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5775 isScalarAfterVectorization(I, VF)) 5776 return false; 5777 5778 // If the instruction is scalar with predication, it will be analyzed 5779 // separately. We ignore it within the context of PredInst. 5780 if (isScalarWithPredication(I)) 5781 return false; 5782 5783 // If any of the instruction's operands are uniform after vectorization, 5784 // the instruction cannot be scalarized. This prevents, for example, a 5785 // masked load from being scalarized. 5786 // 5787 // We assume we will only emit a value for lane zero of an instruction 5788 // marked uniform after vectorization, rather than VF identical values. 5789 // Thus, if we scalarize an instruction that uses a uniform, we would 5790 // create uses of values corresponding to the lanes we aren't emitting code 5791 // for. This behavior can be changed by allowing getScalarValue to clone 5792 // the lane zero values for uniforms rather than asserting. 
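    // A hedged example of the intent (hypothetical IR): if a predicated
    //   %d = sdiv i32 %m, %n
    // is fed by %m = mul i32 %x, %y whose only use is the sdiv and which
    // lives in the same block, the mul may join the scalarized chain; if %m
    // had a second user, or were itself scalar after vectorization already,
    // it would be left alone. The operand check below additionally rejects
    // chains that would read uniform-after-vectorization values, for the
    // reason described above.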
5793 for (Use &U : I->operands()) 5794 if (auto *J = dyn_cast<Instruction>(U.get())) 5795 if (isUniformAfterVectorization(J, VF)) 5796 return false; 5797 5798 // Otherwise, we can scalarize the instruction. 5799 return true; 5800 }; 5801 5802 // Compute the expected cost discount from scalarizing the entire expression 5803 // feeding the predicated instruction. We currently only consider expressions 5804 // that are single-use instruction chains. 5805 Worklist.push_back(PredInst); 5806 while (!Worklist.empty()) { 5807 Instruction *I = Worklist.pop_back_val(); 5808 5809 // If we've already analyzed the instruction, there's nothing to do. 5810 if (ScalarCosts.find(I) != ScalarCosts.end()) 5811 continue; 5812 5813 // Compute the cost of the vector instruction. Note that this cost already 5814 // includes the scalarization overhead of the predicated instruction. 5815 unsigned VectorCost = getInstructionCost(I, VF).first; 5816 5817 // Compute the cost of the scalarized instruction. This cost is the cost of 5818 // the instruction as if it wasn't if-converted and instead remained in the 5819 // predicated block. We will scale this cost by block probability after 5820 // computing the scalarization overhead. 5821 unsigned ScalarCost = VF * getInstructionCost(I, 1).first; 5822 5823 // Compute the scalarization overhead of needed insertelement instructions 5824 // and phi nodes. 5825 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5826 ScalarCost += TTI.getScalarizationOverhead( 5827 cast<VectorType>(ToVectorTy(I->getType(), VF)), 5828 APInt::getAllOnesValue(VF), true, false); 5829 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI, 5830 TTI::TCK_RecipThroughput); 5831 } 5832 5833 // Compute the scalarization overhead of needed extractelement 5834 // instructions. For each of the instruction's operands, if the operand can 5835 // be scalarized, add it to the worklist; otherwise, account for the 5836 // overhead. 5837 for (Use &U : I->operands()) 5838 if (auto *J = dyn_cast<Instruction>(U.get())) { 5839 assert(VectorType::isValidElementType(J->getType()) && 5840 "Instruction has non-scalar type"); 5841 if (canBeScalarized(J)) 5842 Worklist.push_back(J); 5843 else if (needsExtract(J, VF)) 5844 ScalarCost += TTI.getScalarizationOverhead( 5845 cast<VectorType>(ToVectorTy(J->getType(), VF)), 5846 APInt::getAllOnesValue(VF), false, true); 5847 } 5848 5849 // Scale the total scalar cost by block probability. 5850 ScalarCost /= getReciprocalPredBlockProb(); 5851 5852 // Compute the discount. A non-negative discount means the vector version 5853 // of the instruction costs more, and scalarizing would be beneficial. 5854 Discount += VectorCost - ScalarCost; 5855 ScalarCosts[I] = ScalarCost; 5856 } 5857 5858 return Discount; 5859 } 5860 5861 LoopVectorizationCostModel::VectorizationCostTy 5862 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5863 VectorizationCostTy Cost; 5864 5865 // For each block. 5866 for (BasicBlock *BB : TheLoop->blocks()) { 5867 VectorizationCostTy BlockCost; 5868 5869 // For each instruction in the old loop. 5870 for (Instruction &I : BB->instructionsWithoutDebug()) { 5871 // Skip ignored values. 5872 if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I))) 5873 continue; 5874 5875 VectorizationCostTy C = getInstructionCost(&I, VF); 5876 5877 // Check if we should override the cost. 
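      // (Illustration of the override below, using a hypothetical value: if
      // ForceTargetInstructionCost were set to 1, seven counted instructions
      // would sum to a raw BlockCost of 7 regardless of the per-instruction
      // TTI estimates, before any predication scaling further down.)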
5878 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5879 C.first = ForceTargetInstructionCost; 5880 5881 BlockCost.first += C.first; 5882 BlockCost.second |= C.second; 5883 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5884 << " for VF " << VF << " For instruction: " << I 5885 << '\n'); 5886 } 5887 5888 // If we are vectorizing a predicated block, it will have been 5889 // if-converted. This means that the block's instructions (aside from 5890 // stores and instructions that may divide by zero) will now be 5891 // unconditionally executed. For the scalar case, we may not always execute 5892 // the predicated block. Thus, scale the block's cost by the probability of 5893 // executing it. 5894 if (VF == 1 && blockNeedsPredication(BB)) 5895 BlockCost.first /= getReciprocalPredBlockProb(); 5896 5897 Cost.first += BlockCost.first; 5898 Cost.second |= BlockCost.second; 5899 } 5900 5901 return Cost; 5902 } 5903 5904 /// Gets Address Access SCEV after verifying that the access pattern 5905 /// is loop invariant except the induction variable dependence. 5906 /// 5907 /// This SCEV can be sent to the Target in order to estimate the address 5908 /// calculation cost. 5909 static const SCEV *getAddressAccessSCEV( 5910 Value *Ptr, 5911 LoopVectorizationLegality *Legal, 5912 PredicatedScalarEvolution &PSE, 5913 const Loop *TheLoop) { 5914 5915 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5916 if (!Gep) 5917 return nullptr; 5918 5919 // We are looking for a gep with all loop invariant indices except for one 5920 // which should be an induction variable. 5921 auto SE = PSE.getSE(); 5922 unsigned NumOperands = Gep->getNumOperands(); 5923 for (unsigned i = 1; i < NumOperands; ++i) { 5924 Value *Opd = Gep->getOperand(i); 5925 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5926 !Legal->isInductionVariable(Opd)) 5927 return nullptr; 5928 } 5929 5930 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 5931 return PSE.getSCEV(Ptr); 5932 } 5933 5934 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 5935 return Legal->hasStride(I->getOperand(0)) || 5936 Legal->hasStride(I->getOperand(1)); 5937 } 5938 5939 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5940 unsigned VF) { 5941 assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); 5942 Type *ValTy = getMemInstValueType(I); 5943 auto SE = PSE.getSE(); 5944 5945 unsigned AS = getLoadStoreAddressSpace(I); 5946 Value *Ptr = getLoadStorePointerOperand(I); 5947 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5948 5949 // Figure out whether the access is strided and get the stride value 5950 // if it's known in compile time 5951 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5952 5953 // Get the cost of the scalar memory instruction and address computation. 5954 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5955 5956 // Don't pass *I here, since it is scalar but will actually be part of a 5957 // vectorized loop where the user of it is a vectorized instruction. 5958 const Align Alignment = getLoadStoreAlignment(I); 5959 Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 5960 Alignment, AS, 5961 TTI::TCK_RecipThroughput); 5962 5963 // Get the overhead of the extractelement and insertelement instructions 5964 // we might create due to scalarization. 
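  // Rough worked example with hypothetical unit costs (actual numbers come
  // from TTI): at VF = 4, four scalar address computations at cost 1 plus
  // four scalar loads at cost 1 give Cost = 8 so far; the insert/extract
  // overhead added below might bring it to 12, and a predicated access would
  // then be scaled down to 6 (assuming the usual 50% block-probability
  // model) before the emulated-mask hack can pin it to a prohibitive
  // constant.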
5965 Cost += getScalarizationOverhead(I, VF); 5966 5967 // If we have a predicated store, it may not be executed for each vector 5968 // lane. Scale the cost by the probability of executing the predicated 5969 // block. 5970 if (isPredicatedInst(I)) { 5971 Cost /= getReciprocalPredBlockProb(); 5972 5973 if (useEmulatedMaskMemRefHack(I)) 5974 // Artificially setting to a high enough value to practically disable 5975 // vectorization with such operations. 5976 Cost = 3000000; 5977 } 5978 5979 return Cost; 5980 } 5981 5982 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5983 unsigned VF) { 5984 Type *ValTy = getMemInstValueType(I); 5985 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5986 Value *Ptr = getLoadStorePointerOperand(I); 5987 unsigned AS = getLoadStoreAddressSpace(I); 5988 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 5989 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5990 5991 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5992 "Stride should be 1 or -1 for consecutive memory access"); 5993 const Align Alignment = getLoadStoreAlignment(I); 5994 unsigned Cost = 0; 5995 if (Legal->isMaskRequired(I)) 5996 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 5997 CostKind); 5998 else 5999 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6000 CostKind, I); 6001 6002 bool Reverse = ConsecutiveStride < 0; 6003 if (Reverse) 6004 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6005 return Cost; 6006 } 6007 6008 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6009 unsigned VF) { 6010 Type *ValTy = getMemInstValueType(I); 6011 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6012 const Align Alignment = getLoadStoreAlignment(I); 6013 unsigned AS = getLoadStoreAddressSpace(I); 6014 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6015 if (isa<LoadInst>(I)) { 6016 return TTI.getAddressComputationCost(ValTy) + 6017 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6018 CostKind) + 6019 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6020 } 6021 StoreInst *SI = cast<StoreInst>(I); 6022 6023 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6024 return TTI.getAddressComputationCost(ValTy) + 6025 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6026 CostKind) + 6027 (isLoopInvariantStoreValue 6028 ? 
0 6029 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6030 VF - 1)); 6031 } 6032 6033 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6034 unsigned VF) { 6035 Type *ValTy = getMemInstValueType(I); 6036 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6037 const Align Alignment = getLoadStoreAlignment(I); 6038 const Value *Ptr = getLoadStorePointerOperand(I); 6039 6040 return TTI.getAddressComputationCost(VectorTy) + 6041 TTI.getGatherScatterOpCost( 6042 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6043 TargetTransformInfo::TCK_RecipThroughput, I); 6044 } 6045 6046 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6047 unsigned VF) { 6048 Type *ValTy = getMemInstValueType(I); 6049 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6050 unsigned AS = getLoadStoreAddressSpace(I); 6051 6052 auto Group = getInterleavedAccessGroup(I); 6053 assert(Group && "Fail to get an interleaved access group."); 6054 6055 unsigned InterleaveFactor = Group->getFactor(); 6056 auto *WideVecTy = FixedVectorType::get(ValTy, VF * InterleaveFactor); 6057 6058 // Holds the indices of existing members in an interleaved load group. 6059 // An interleaved store group doesn't need this as it doesn't allow gaps. 6060 SmallVector<unsigned, 4> Indices; 6061 if (isa<LoadInst>(I)) { 6062 for (unsigned i = 0; i < InterleaveFactor; i++) 6063 if (Group->getMember(i)) 6064 Indices.push_back(i); 6065 } 6066 6067 // Calculate the cost of the whole interleaved group. 6068 bool UseMaskForGaps = 6069 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6070 unsigned Cost = TTI.getInterleavedMemoryOpCost( 6071 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6072 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6073 6074 if (Group->isReverse()) { 6075 // TODO: Add support for reversed masked interleaved access. 6076 assert(!Legal->isMaskRequired(I) && 6077 "Reverse masked interleaved access not supported."); 6078 Cost += Group->getNumMembers() * 6079 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6080 } 6081 return Cost; 6082 } 6083 6084 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6085 unsigned VF) { 6086 // Calculate scalar cost only. Vectorization cost should be ready at this 6087 // moment. 6088 if (VF == 1) { 6089 Type *ValTy = getMemInstValueType(I); 6090 const Align Alignment = getLoadStoreAlignment(I); 6091 unsigned AS = getLoadStoreAddressSpace(I); 6092 6093 return TTI.getAddressComputationCost(ValTy) + 6094 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6095 TTI::TCK_RecipThroughput, I); 6096 } 6097 return getWideningCost(I, VF); 6098 } 6099 6100 LoopVectorizationCostModel::VectorizationCostTy 6101 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 6102 // If we know that this instruction will remain uniform, check the cost of 6103 // the scalar version. 6104 if (isUniformAfterVectorization(I, VF)) 6105 VF = 1; 6106 6107 if (VF > 1 && isProfitableToScalarize(I, VF)) 6108 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6109 6110 // Forced scalars do not have any scalarization overhead. 
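  // For example, an address-computation instruction that
  // setCostBasedWideningDecision placed in ForcedScalars (so that LSR can
  // still form scalar addresses) is charged VF copies of its scalar cost by
  // the lookup below, with no insert/extract overhead added on top.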
6111 auto ForcedScalar = ForcedScalars.find(VF); 6112 if (VF > 1 && ForcedScalar != ForcedScalars.end()) { 6113 auto InstSet = ForcedScalar->second; 6114 if (InstSet.count(I)) 6115 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); 6116 } 6117 6118 Type *VectorTy; 6119 unsigned C = getInstructionCost(I, VF, VectorTy); 6120 6121 bool TypeNotScalarized = 6122 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; 6123 return VectorizationCostTy(C, TypeNotScalarized); 6124 } 6125 6126 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6127 unsigned VF) { 6128 6129 if (VF == 1) 6130 return 0; 6131 6132 unsigned Cost = 0; 6133 Type *RetTy = ToVectorTy(I->getType(), VF); 6134 if (!RetTy->isVoidTy() && 6135 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6136 Cost += TTI.getScalarizationOverhead( 6137 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF), true, false); 6138 6139 // Some targets keep addresses scalar. 6140 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6141 return Cost; 6142 6143 // Some targets support efficient element stores. 6144 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6145 return Cost; 6146 6147 // Collect operands to consider. 6148 CallInst *CI = dyn_cast<CallInst>(I); 6149 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6150 6151 // Skip operands that do not require extraction/scalarization and do not incur 6152 // any overhead. 6153 return Cost + TTI.getOperandsScalarizationOverhead( 6154 filterExtractingOperands(Ops, VF), VF); 6155 } 6156 6157 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { 6158 if (VF == 1) 6159 return; 6160 NumPredStores = 0; 6161 for (BasicBlock *BB : TheLoop->blocks()) { 6162 // For each instruction in the old loop. 6163 for (Instruction &I : *BB) { 6164 Value *Ptr = getLoadStorePointerOperand(&I); 6165 if (!Ptr) 6166 continue; 6167 6168 // TODO: We should generate better code and update the cost model for 6169 // predicated uniform stores. Today they are treated as any other 6170 // predicated store (see added test cases in 6171 // invariant-store-vectorization.ll). 6172 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6173 NumPredStores++; 6174 6175 if (Legal->isUniform(Ptr) && 6176 // Conditional loads and stores should be scalarized and predicated. 6177 // isScalarWithPredication cannot be used here since masked 6178 // gather/scatters are not considered scalar with predication. 6179 !Legal->blockNeedsPredication(I.getParent())) { 6180 // TODO: Avoid replicating loads and stores instead of 6181 // relying on instcombine to remove them. 6182 // Load: Scalar load + broadcast 6183 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6184 unsigned Cost = getUniformMemOpCost(&I, VF); 6185 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6186 continue; 6187 } 6188 6189 // We assume that widening is the best solution when possible. 6190 if (memoryInstructionCanBeWidened(&I, VF)) { 6191 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6192 int ConsecutiveStride = 6193 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6194 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6195 "Expected consecutive stride."); 6196 InstWidening Decision = 6197 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6198 setWideningDecision(&I, VF, Decision, Cost); 6199 continue; 6200 } 6201 6202 // Choose between Interleaving, Gather/Scatter or Scalarization. 
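      // Sketch of the decision with hypothetical costs: for a two-member
      // interleave group where InterleaveCost = 6, GatherScatterCost = 10
      // per access (20 for the group) and ScalarizationCost = 28 for the
      // group, the code below picks CM_Interleave and records that decision
      // and cost once for the whole group.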
6203 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6204 unsigned NumAccesses = 1; 6205 if (isAccessInterleaved(&I)) { 6206 auto Group = getInterleavedAccessGroup(&I); 6207 assert(Group && "Fail to get an interleaved access group."); 6208 6209 // Make one decision for the whole group. 6210 if (getWideningDecision(&I, VF) != CM_Unknown) 6211 continue; 6212 6213 NumAccesses = Group->getNumMembers(); 6214 if (interleavedAccessCanBeWidened(&I, VF)) 6215 InterleaveCost = getInterleaveGroupCost(&I, VF); 6216 } 6217 6218 unsigned GatherScatterCost = 6219 isLegalGatherOrScatter(&I) 6220 ? getGatherScatterCost(&I, VF) * NumAccesses 6221 : std::numeric_limits<unsigned>::max(); 6222 6223 unsigned ScalarizationCost = 6224 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6225 6226 // Choose better solution for the current VF, 6227 // write down this decision and use it during vectorization. 6228 unsigned Cost; 6229 InstWidening Decision; 6230 if (InterleaveCost <= GatherScatterCost && 6231 InterleaveCost < ScalarizationCost) { 6232 Decision = CM_Interleave; 6233 Cost = InterleaveCost; 6234 } else if (GatherScatterCost < ScalarizationCost) { 6235 Decision = CM_GatherScatter; 6236 Cost = GatherScatterCost; 6237 } else { 6238 Decision = CM_Scalarize; 6239 Cost = ScalarizationCost; 6240 } 6241 // If the instructions belongs to an interleave group, the whole group 6242 // receives the same decision. The whole group receives the cost, but 6243 // the cost will actually be assigned to one instruction. 6244 if (auto Group = getInterleavedAccessGroup(&I)) 6245 setWideningDecision(Group, VF, Decision, Cost); 6246 else 6247 setWideningDecision(&I, VF, Decision, Cost); 6248 } 6249 } 6250 6251 // Make sure that any load of address and any other address computation 6252 // remains scalar unless there is gather/scatter support. This avoids 6253 // inevitable extracts into address registers, and also has the benefit of 6254 // activating LSR more, since that pass can't optimize vectorized 6255 // addresses. 6256 if (TTI.prefersVectorizedAddressing()) 6257 return; 6258 6259 // Start with all scalar pointer uses. 6260 SmallPtrSet<Instruction *, 8> AddrDefs; 6261 for (BasicBlock *BB : TheLoop->blocks()) 6262 for (Instruction &I : *BB) { 6263 Instruction *PtrDef = 6264 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6265 if (PtrDef && TheLoop->contains(PtrDef) && 6266 getWideningDecision(&I, VF) != CM_GatherScatter) 6267 AddrDefs.insert(PtrDef); 6268 } 6269 6270 // Add all instructions used to generate the addresses. 6271 SmallVector<Instruction *, 4> Worklist; 6272 for (auto *I : AddrDefs) 6273 Worklist.push_back(I); 6274 while (!Worklist.empty()) { 6275 Instruction *I = Worklist.pop_back_val(); 6276 for (auto &Op : I->operands()) 6277 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6278 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6279 AddrDefs.insert(InstOp).second) 6280 Worklist.push_back(InstOp); 6281 } 6282 6283 for (auto *I : AddrDefs) { 6284 if (isa<LoadInst>(I)) { 6285 // Setting the desired widening decision should ideally be handled in 6286 // by cost functions, but since this involves the task of finding out 6287 // if the loaded register is involved in an address computation, it is 6288 // instead changed here when we know this is the case. 6289 InstWidening Decision = getWideningDecision(I, VF); 6290 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6291 // Scalarize a widened load of address. 
6292 setWideningDecision(I, VF, CM_Scalarize, 6293 (VF * getMemoryInstructionCost(I, 1))); 6294 else if (auto Group = getInterleavedAccessGroup(I)) { 6295 // Scalarize an interleave group of address loads. 6296 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6297 if (Instruction *Member = Group->getMember(I)) 6298 setWideningDecision(Member, VF, CM_Scalarize, 6299 (VF * getMemoryInstructionCost(Member, 1))); 6300 } 6301 } 6302 } else 6303 // Make sure I gets scalarized and a cost estimate without 6304 // scalarization overhead. 6305 ForcedScalars[VF].insert(I); 6306 } 6307 } 6308 6309 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6310 unsigned VF, 6311 Type *&VectorTy) { 6312 Type *RetTy = I->getType(); 6313 if (canTruncateToMinimalBitwidth(I, VF)) 6314 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6315 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6316 auto SE = PSE.getSE(); 6317 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6318 6319 // TODO: We need to estimate the cost of intrinsic calls. 6320 switch (I->getOpcode()) { 6321 case Instruction::GetElementPtr: 6322 // We mark this instruction as zero-cost because the cost of GEPs in 6323 // vectorized code depends on whether the corresponding memory instruction 6324 // is scalarized or not. Therefore, we handle GEPs with the memory 6325 // instruction cost. 6326 return 0; 6327 case Instruction::Br: { 6328 // In cases of scalarized and predicated instructions, there will be VF 6329 // predicated blocks in the vectorized loop. Each branch around these 6330 // blocks requires also an extract of its vector compare i1 element. 6331 bool ScalarPredicatedBB = false; 6332 BranchInst *BI = cast<BranchInst>(I); 6333 if (VF > 1 && BI->isConditional() && 6334 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 6335 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 6336 ScalarPredicatedBB = true; 6337 6338 if (ScalarPredicatedBB) { 6339 // Return cost for branches around scalarized and predicated blocks. 6340 auto *Vec_i1Ty = 6341 FixedVectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6342 return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF), 6343 false, true) + 6344 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF)); 6345 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) 6346 // The back-edge branch will remain, as will all scalar branches. 6347 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6348 else 6349 // This branch will be eliminated by if-conversion. 6350 return 0; 6351 // Note: We currently assume zero cost for an unconditional branch inside 6352 // a predicated block since it will become a fall-through, although we 6353 // may decide in the future to call TTI for all branches. 6354 } 6355 case Instruction::PHI: { 6356 auto *Phi = cast<PHINode>(I); 6357 6358 // First-order recurrences are replaced by vector shuffles inside the loop. 6359 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6360 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) 6361 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 6362 cast<VectorType>(VectorTy), VF - 1, 6363 FixedVectorType::get(RetTy, 1)); 6364 6365 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6366 // converted into select instructions. We require N - 1 selects per phi 6367 // node, where N is the number of incoming values. 
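    // For instance, an if-converted phi merging three incoming values becomes
    // two vector selects, so the cost returned below is 2 * the cost of a
    // select driven by a <VF x i1> condition.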
6368 if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) 6369 return (Phi->getNumIncomingValues() - 1) * 6370 TTI.getCmpSelInstrCost( 6371 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6372 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6373 CostKind); 6374 6375 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6376 } 6377 case Instruction::UDiv: 6378 case Instruction::SDiv: 6379 case Instruction::URem: 6380 case Instruction::SRem: 6381 // If we have a predicated instruction, it may not be executed for each 6382 // vector lane. Get the scalarization cost and scale this amount by the 6383 // probability of executing the predicated block. If the instruction is not 6384 // predicated, we fall through to the next case. 6385 if (VF > 1 && isScalarWithPredication(I)) { 6386 unsigned Cost = 0; 6387 6388 // These instructions have a non-void type, so account for the phi nodes 6389 // that we will create. This cost is likely to be zero. The phi node 6390 // cost, if any, should be scaled by the block probability because it 6391 // models a copy at the end of each predicated block. 6392 Cost += VF * TTI.getCFInstrCost(Instruction::PHI, CostKind); 6393 6394 // The cost of the non-predicated instruction. 6395 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 6396 6397 // The cost of insertelement and extractelement instructions needed for 6398 // scalarization. 6399 Cost += getScalarizationOverhead(I, VF); 6400 6401 // Scale the cost by the probability of executing the predicated blocks. 6402 // This assumes the predicated block for each vector lane is equally 6403 // likely. 6404 return Cost / getReciprocalPredBlockProb(); 6405 } 6406 LLVM_FALLTHROUGH; 6407 case Instruction::Add: 6408 case Instruction::FAdd: 6409 case Instruction::Sub: 6410 case Instruction::FSub: 6411 case Instruction::Mul: 6412 case Instruction::FMul: 6413 case Instruction::FDiv: 6414 case Instruction::FRem: 6415 case Instruction::Shl: 6416 case Instruction::LShr: 6417 case Instruction::AShr: 6418 case Instruction::And: 6419 case Instruction::Or: 6420 case Instruction::Xor: { 6421 // Since we will replace the stride by 1 the multiplication should go away. 6422 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6423 return 0; 6424 // Certain instructions can be cheaper to vectorize if they have a constant 6425 // second vector operand. One example of this are shifts on x86. 6426 Value *Op2 = I->getOperand(1); 6427 TargetTransformInfo::OperandValueProperties Op2VP; 6428 TargetTransformInfo::OperandValueKind Op2VK = 6429 TTI.getOperandInfo(Op2, Op2VP); 6430 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6431 Op2VK = TargetTransformInfo::OK_UniformValue; 6432 6433 SmallVector<const Value *, 4> Operands(I->operand_values()); 6434 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6435 return N * TTI.getArithmeticInstrCost( 6436 I->getOpcode(), VectorTy, CostKind, 6437 TargetTransformInfo::OK_AnyValue, 6438 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6439 } 6440 case Instruction::FNeg: { 6441 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6442 return N * TTI.getArithmeticInstrCost( 6443 I->getOpcode(), VectorTy, CostKind, 6444 TargetTransformInfo::OK_AnyValue, 6445 TargetTransformInfo::OK_AnyValue, 6446 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6447 I->getOperand(0), I); 6448 } 6449 case Instruction::Select: { 6450 SelectInst *SI = cast<SelectInst>(I); 6451 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6452 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6453 Type *CondTy = SI->getCondition()->getType(); 6454 if (!ScalarCond) 6455 CondTy = FixedVectorType::get(CondTy, VF); 6456 6457 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 6458 CostKind, I); 6459 } 6460 case Instruction::ICmp: 6461 case Instruction::FCmp: { 6462 Type *ValTy = I->getOperand(0)->getType(); 6463 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6464 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6465 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6466 VectorTy = ToVectorTy(ValTy, VF); 6467 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind, 6468 I); 6469 } 6470 case Instruction::Store: 6471 case Instruction::Load: { 6472 unsigned Width = VF; 6473 if (Width > 1) { 6474 InstWidening Decision = getWideningDecision(I, Width); 6475 assert(Decision != CM_Unknown && 6476 "CM decision should be taken at this point"); 6477 if (Decision == CM_Scalarize) 6478 Width = 1; 6479 } 6480 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6481 return getMemoryInstructionCost(I, VF); 6482 } 6483 case Instruction::ZExt: 6484 case Instruction::SExt: 6485 case Instruction::FPToUI: 6486 case Instruction::FPToSI: 6487 case Instruction::FPExt: 6488 case Instruction::PtrToInt: 6489 case Instruction::IntToPtr: 6490 case Instruction::SIToFP: 6491 case Instruction::UIToFP: 6492 case Instruction::Trunc: 6493 case Instruction::FPTrunc: 6494 case Instruction::BitCast: { 6495 // Computes the CastContextHint from a Load/Store instruction. 6496 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 6497 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 6498 "Expected a load or a store!"); 6499 6500 if (VF == 1 || !TheLoop->contains(I)) 6501 return TTI::CastContextHint::Normal; 6502 6503 switch (getWideningDecision(I, VF)) { 6504 case LoopVectorizationCostModel::CM_GatherScatter: 6505 return TTI::CastContextHint::GatherScatter; 6506 case LoopVectorizationCostModel::CM_Interleave: 6507 return TTI::CastContextHint::Interleave; 6508 case LoopVectorizationCostModel::CM_Scalarize: 6509 case LoopVectorizationCostModel::CM_Widen: 6510 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 6511 : TTI::CastContextHint::Normal; 6512 case LoopVectorizationCostModel::CM_Widen_Reverse: 6513 return TTI::CastContextHint::Reversed; 6514 case LoopVectorizationCostModel::CM_Unknown: 6515 llvm_unreachable("Instr did not go through cost modelling?"); 6516 } 6517 6518 llvm_unreachable("Unhandled case!"); 6519 }; 6520 6521 unsigned Opcode = I->getOpcode(); 6522 TTI::CastContextHint CCH = TTI::CastContextHint::None; 6523 // For Trunc, the context is the only user, which must be a StoreInst. 6524 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 6525 if (I->hasOneUse()) 6526 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 6527 CCH = ComputeCCH(Store); 6528 } 6529 // For Z/Sext, the context is the operand, which must be a LoadInst. 
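    // (Example of the hint's effect, based on ComputeCCH above: a zext whose
    // operand is a load that will be widened in reverse order gets
    // CastContextHint::Reversed, and one fed by a gather gets
    // CastContextHint::GatherScatter, so targets that can fold the extend
    // into such loads may report a cheaper cast.)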
6530 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 6531 Opcode == Instruction::FPExt) { 6532 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 6533 CCH = ComputeCCH(Load); 6534 } 6535 6536 // We optimize the truncation of induction variables having constant 6537 // integer steps. The cost of these truncations is the same as the scalar 6538 // operation. 6539 if (isOptimizableIVTruncate(I, VF)) { 6540 auto *Trunc = cast<TruncInst>(I); 6541 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6542 Trunc->getSrcTy(), CCH, CostKind, Trunc); 6543 } 6544 6545 Type *SrcScalarTy = I->getOperand(0)->getType(); 6546 Type *SrcVecTy = 6547 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6548 if (canTruncateToMinimalBitwidth(I, VF)) { 6549 // This cast is going to be shrunk. This may remove the cast or it might 6550 // turn it into slightly different cast. For example, if MinBW == 16, 6551 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6552 // 6553 // Calculate the modified src and dest types. 6554 Type *MinVecTy = VectorTy; 6555 if (Opcode == Instruction::Trunc) { 6556 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6557 VectorTy = 6558 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6559 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 6560 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6561 VectorTy = 6562 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6563 } 6564 } 6565 6566 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6567 return N * 6568 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 6569 } 6570 case Instruction::Call: { 6571 bool NeedToScalarize; 6572 CallInst *CI = cast<CallInst>(I); 6573 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6574 if (getVectorIntrinsicIDForCall(CI, TLI)) 6575 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6576 return CallCost; 6577 } 6578 default: 6579 // The cost of executing VF copies of the scalar instruction. This opcode 6580 // is unknown. Assume that it is the same as 'mul'. 6581 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, 6582 CostKind) + 6583 getScalarizationOverhead(I, VF); 6584 } // end of switch. 
6585 } 6586 6587 char LoopVectorize::ID = 0; 6588 6589 static const char lv_name[] = "Loop Vectorization"; 6590 6591 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6592 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6593 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6594 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6595 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6596 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6597 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6598 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6599 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6600 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6601 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6602 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6603 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6604 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6605 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6606 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6607 6608 namespace llvm { 6609 6610 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6611 6612 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6613 bool VectorizeOnlyWhenForced) { 6614 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6615 } 6616 6617 } // end namespace llvm 6618 6619 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6620 // Check if the pointer operand of a load or store instruction is 6621 // consecutive. 6622 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6623 return Legal->isConsecutivePtr(Ptr); 6624 return false; 6625 } 6626 6627 void LoopVectorizationCostModel::collectValuesToIgnore() { 6628 // Ignore ephemeral values. 6629 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6630 6631 // Ignore type-promoting instructions we identified during reduction 6632 // detection. 6633 for (auto &Reduction : Legal->getReductionVars()) { 6634 RecurrenceDescriptor &RedDes = Reduction.second; 6635 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6636 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6637 } 6638 // Ignore type-casting instructions we identified during induction 6639 // detection. 6640 for (auto &Induction : Legal->getInductionVars()) { 6641 InductionDescriptor &IndDes = Induction.second; 6642 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6643 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6644 } 6645 } 6646 6647 // TODO: we could return a pair of values that specify the max VF and 6648 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6649 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 6650 // doesn't have a cost model that can choose which plan to execute if 6651 // more than one is generated. 6652 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 6653 LoopVectorizationCostModel &CM) { 6654 unsigned WidestType; 6655 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 6656 return WidestVectorRegBits / WidestType; 6657 } 6658 6659 VectorizationFactor 6660 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { 6661 unsigned VF = UserVF; 6662 // Outer loop handling: They may require CFG and instruction level 6663 // transformations before even evaluating whether vectorization is profitable. 
6664 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 6665 // the vectorization pipeline. 6666 if (!OrigLoop->empty()) { 6667 // If the user doesn't provide a vectorization factor, determine a 6668 // reasonable one. 6669 if (!UserVF) { 6670 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM); 6671 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 6672 6673 // Make sure we have a VF > 1 for stress testing. 6674 if (VPlanBuildStressTest && VF < 2) { 6675 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 6676 << "overriding computed VF.\n"); 6677 VF = 4; 6678 } 6679 } 6680 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 6681 assert(isPowerOf2_32(VF) && "VF needs to be a power of two"); 6682 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF 6683 << " to build VPlans.\n"); 6684 buildVPlans(VF, VF); 6685 6686 // For VPlan build stress testing, we bail out after VPlan construction. 6687 if (VPlanBuildStressTest) 6688 return VectorizationFactor::Disabled(); 6689 6690 return {VF, 0}; 6691 } 6692 6693 LLVM_DEBUG( 6694 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 6695 "VPlan-native path.\n"); 6696 return VectorizationFactor::Disabled(); 6697 } 6698 6699 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF, 6700 unsigned UserIC) { 6701 assert(OrigLoop->empty() && "Inner loop expected."); 6702 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); 6703 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 6704 return None; 6705 6706 // Invalidate interleave groups if all blocks of loop will be predicated. 6707 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 6708 !useMaskedInterleavedAccesses(*TTI)) { 6709 LLVM_DEBUG( 6710 dbgs() 6711 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 6712 "which requires masked-interleaved support.\n"); 6713 if (CM.InterleaveInfo.invalidateGroups()) 6714 // Invalidating interleave groups also requires invalidating all decisions 6715 // based on them, which includes widening decisions and uniform and scalar 6716 // values. 6717 CM.invalidateCostModelingDecisions(); 6718 } 6719 6720 if (UserVF) { 6721 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 6722 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); 6723 // Collect the instructions (and their associated costs) that will be more 6724 // profitable to scalarize. 6725 CM.selectUserVectorizationFactor(UserVF); 6726 buildVPlansWithVPRecipes(UserVF, UserVF); 6727 LLVM_DEBUG(printPlans(dbgs())); 6728 return {{UserVF, 0}}; 6729 } 6730 6731 unsigned MaxVF = MaybeMaxVF.getValue(); 6732 assert(MaxVF != 0 && "MaxVF is zero."); 6733 6734 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 6735 // Collect Uniform and Scalar instructions after vectorization with VF. 6736 CM.collectUniformsAndScalars(VF); 6737 6738 // Collect the instructions (and their associated costs) that will be more 6739 // profitable to scalarize. 6740 if (VF > 1) 6741 CM.collectInstsToScalarize(VF); 6742 } 6743 6744 buildVPlansWithVPRecipes(1, MaxVF); 6745 LLVM_DEBUG(printPlans(dbgs())); 6746 if (MaxVF == 1) 6747 return VectorizationFactor::Disabled(); 6748 6749 // Select the optimal vectorization factor. 
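  // For example, with MaxVF = 8 the loop above prepared the per-VF analyses
  // and buildVPlansWithVPRecipes(1, MaxVF) built plans covering VF = 1, 2, 4
  // and 8; the call below then selects the most profitable factor among
  // these.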
6750 return CM.selectVectorizationFactor(MaxVF); 6751 } 6752 6753 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { 6754 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 6755 << '\n'); 6756 BestVF = VF; 6757 BestUF = UF; 6758 6759 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 6760 return !Plan->hasVF(VF); 6761 }); 6762 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 6763 } 6764 6765 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 6766 DominatorTree *DT) { 6767 // Perform the actual loop transformation. 6768 6769 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 6770 VPCallbackILV CallbackILV(ILV); 6771 6772 VPTransformState State{BestVF, BestUF, LI, 6773 DT, ILV.Builder, ILV.VectorLoopValueMap, 6774 &ILV, CallbackILV}; 6775 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 6776 State.TripCount = ILV.getOrCreateTripCount(nullptr); 6777 State.CanonicalIV = ILV.Induction; 6778 6779 //===------------------------------------------------===// 6780 // 6781 // Notice: any optimization or new instruction that go 6782 // into the code below should also be implemented in 6783 // the cost-model. 6784 // 6785 //===------------------------------------------------===// 6786 6787 // 2. Copy and widen instructions from the old loop into the new loop. 6788 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 6789 VPlans.front()->execute(&State); 6790 6791 // 3. Fix the vectorized code: take care of header phi's, live-outs, 6792 // predication, updating analyses. 6793 ILV.fixVectorizedLoop(); 6794 } 6795 6796 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 6797 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6798 BasicBlock *Latch = OrigLoop->getLoopLatch(); 6799 6800 // We create new control-flow for the vectorized loop, so the original 6801 // condition will be dead after vectorization if it's only used by the 6802 // branch. 6803 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 6804 if (Cmp && Cmp->hasOneUse()) 6805 DeadInstructions.insert(Cmp); 6806 6807 // We create new "steps" for induction variable updates to which the original 6808 // induction variables map. An original update instruction will be dead if 6809 // all its users except the induction variable are dead. 6810 for (auto &Induction : Legal->getInductionVars()) { 6811 PHINode *Ind = Induction.first; 6812 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 6813 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 6814 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 6815 })) 6816 DeadInstructions.insert(IndUpdate); 6817 6818 // We record as "Dead" also the type-casting instructions we had identified 6819 // during induction analysis. We don't need any handling for them in the 6820 // vectorized loop because we have proven that, under a proper runtime 6821 // test guarding the vectorized loop, the value of the phi, and the casted 6822 // value of the phi, are the same. The last instruction in this casting chain 6823 // will get its scalar/vector/widened def from the scalar/vector/widened def 6824 // of the respective phi node. Any other casts in the induction def-use chain 6825 // have no other uses outside the phi update chain, and will be ignored. 
6826 InductionDescriptor &IndDes = Induction.second; 6827 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6828 DeadInstructions.insert(Casts.begin(), Casts.end()); 6829 } 6830 } 6831 6832 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6833 6834 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6835 6836 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6837 Instruction::BinaryOps BinOp) { 6838 // When unrolling and the VF is 1, we only need to add a simple scalar. 6839 Type *Ty = Val->getType(); 6840 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6841 6842 if (Ty->isFloatingPointTy()) { 6843 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6844 6845 // Floating point operations had to be 'fast' to enable the unrolling. 6846 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6847 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6848 } 6849 Constant *C = ConstantInt::get(Ty, StartIdx); 6850 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6851 } 6852 6853 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6854 SmallVector<Metadata *, 4> MDs; 6855 // Reserve first location for self reference to the LoopID metadata node. 6856 MDs.push_back(nullptr); 6857 bool IsUnrollMetadata = false; 6858 MDNode *LoopID = L->getLoopID(); 6859 if (LoopID) { 6860 // First find existing loop unrolling disable metadata. 6861 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6862 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6863 if (MD) { 6864 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6865 IsUnrollMetadata = 6866 S && S->getString().startswith("llvm.loop.unroll.disable"); 6867 } 6868 MDs.push_back(LoopID->getOperand(i)); 6869 } 6870 } 6871 6872 if (!IsUnrollMetadata) { 6873 // Add runtime unroll disable metadata. 6874 LLVMContext &Context = L->getHeader()->getContext(); 6875 SmallVector<Metadata *, 1> DisableOperands; 6876 DisableOperands.push_back( 6877 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6878 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6879 MDs.push_back(DisableNode); 6880 MDNode *NewLoopID = MDNode::get(Context, MDs); 6881 // Set operand 0 to refer to the loop id itself. 6882 NewLoopID->replaceOperandWith(0, NewLoopID); 6883 L->setLoopID(NewLoopID); 6884 } 6885 } 6886 6887 bool LoopVectorizationPlanner::getDecisionAndClampRange( 6888 const std::function<bool(unsigned)> &Predicate, VFRange &Range) { 6889 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 6890 bool PredicateAtRangeStart = Predicate(Range.Start); 6891 6892 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 6893 if (Predicate(TmpVF) != PredicateAtRangeStart) { 6894 Range.End = TmpVF; 6895 break; 6896 } 6897 6898 return PredicateAtRangeStart; 6899 } 6900 6901 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 6902 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 6903 /// of VF's starting at a given VF and extending it as much as possible. Each 6904 /// vectorization decision can potentially shorten this sub-range during 6905 /// buildVPlan(). 
6906 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 6907 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6908 VFRange SubRange = {VF, MaxVF + 1}; 6909 VPlans.push_back(buildVPlan(SubRange)); 6910 VF = SubRange.End; 6911 } 6912 } 6913 6914 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 6915 VPlanPtr &Plan) { 6916 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 6917 6918 // Look for cached value. 6919 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 6920 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 6921 if (ECEntryIt != EdgeMaskCache.end()) 6922 return ECEntryIt->second; 6923 6924 VPValue *SrcMask = createBlockInMask(Src, Plan); 6925 6926 // The terminator has to be a branch inst! 6927 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 6928 assert(BI && "Unexpected terminator found"); 6929 6930 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 6931 return EdgeMaskCache[Edge] = SrcMask; 6932 6933 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 6934 assert(EdgeMask && "No Edge Mask found for condition"); 6935 6936 if (BI->getSuccessor(0) != Dst) 6937 EdgeMask = Builder.createNot(EdgeMask); 6938 6939 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 6940 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 6941 6942 return EdgeMaskCache[Edge] = EdgeMask; 6943 } 6944 6945 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 6946 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 6947 6948 // Look for cached value. 6949 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 6950 if (BCEntryIt != BlockMaskCache.end()) 6951 return BCEntryIt->second; 6952 6953 // All-one mask is modelled as no-mask following the convention for masked 6954 // load/store/gather/scatter. Initialize BlockMask to no-mask. 6955 VPValue *BlockMask = nullptr; 6956 6957 if (OrigLoop->getHeader() == BB) { 6958 if (!CM.blockNeedsPredication(BB)) 6959 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 6960 6961 // Introduce the early-exit compare IV <= BTC to form header block mask. 6962 // This is used instead of IV < TC because TC may wrap, unlike BTC. 6963 // Start by constructing the desired canonical IV. 6964 VPValue *IV = nullptr; 6965 if (Legal->getPrimaryInduction()) 6966 IV = Plan->getVPValue(Legal->getPrimaryInduction()); 6967 else { 6968 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 6969 Builder.getInsertBlock()->appendRecipe(IVRecipe); 6970 IV = IVRecipe->getVPValue(); 6971 } 6972 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 6973 bool TailFolded = !CM.isScalarEpilogueAllowed(); 6974 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) 6975 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC}); 6976 else 6977 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 6978 return BlockMaskCache[BB] = BlockMask; 6979 } 6980 6981 // This is the block mask. We OR all incoming edges. 6982 for (auto *Predecessor : predecessors(BB)) { 6983 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 6984 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 6985 return BlockMaskCache[BB] = EdgeMask; 6986 6987 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
6988 BlockMask = EdgeMask; 6989 continue; 6990 } 6991 6992 BlockMask = Builder.createOr(BlockMask, EdgeMask); 6993 } 6994 6995 return BlockMaskCache[BB] = BlockMask; 6996 } 6997 6998 VPWidenMemoryInstructionRecipe * 6999 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 7000 VPlanPtr &Plan) { 7001 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7002 "Must be called with either a load or store"); 7003 7004 auto willWiden = [&](unsigned VF) -> bool { 7005 if (VF == 1) 7006 return false; 7007 LoopVectorizationCostModel::InstWidening Decision = 7008 CM.getWideningDecision(I, VF); 7009 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 7010 "CM decision should be taken at this point."); 7011 if (Decision == LoopVectorizationCostModel::CM_Interleave) 7012 return true; 7013 if (CM.isScalarAfterVectorization(I, VF) || 7014 CM.isProfitableToScalarize(I, VF)) 7015 return false; 7016 return Decision != LoopVectorizationCostModel::CM_Scalarize; 7017 }; 7018 7019 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7020 return nullptr; 7021 7022 VPValue *Mask = nullptr; 7023 if (Legal->isMaskRequired(I)) 7024 Mask = createBlockInMask(I->getParent(), Plan); 7025 7026 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 7027 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 7028 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 7029 7030 StoreInst *Store = cast<StoreInst>(I); 7031 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 7032 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 7033 } 7034 7035 VPWidenIntOrFpInductionRecipe * 7036 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { 7037 // Check if this is an integer or fp induction. If so, build the recipe that 7038 // produces its scalar and vector values. 7039 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 7040 if (II.getKind() == InductionDescriptor::IK_IntInduction || 7041 II.getKind() == InductionDescriptor::IK_FpInduction) 7042 return new VPWidenIntOrFpInductionRecipe(Phi); 7043 7044 return nullptr; 7045 } 7046 7047 VPWidenIntOrFpInductionRecipe * 7048 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, 7049 VFRange &Range) const { 7050 // Optimize the special case where the source is a constant integer 7051 // induction variable. Notice that we can only optimize the 'trunc' case 7052 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 7053 // (c) other casts depend on pointer size. 7054 7055 // Determine whether \p K is a truncation based on an induction variable that 7056 // can be optimized. 7057 auto isOptimizableIVTruncate = 7058 [&](Instruction *K) -> std::function<bool(unsigned)> { 7059 return 7060 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; 7061 }; 7062 7063 if (LoopVectorizationPlanner::getDecisionAndClampRange( 7064 isOptimizableIVTruncate(I), Range)) 7065 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 7066 I); 7067 return nullptr; 7068 } 7069 7070 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 7071 // We know that all PHIs in non-header blocks are converted into selects, so 7072 // we don't have to worry about the insertion order and we can just use the 7073 // builder. At this point we generate the predication tree. There may be 7074 // duplications since this is a simple recursive scan, but future 7075 // optimizations will clean it up. 
7076 7077 SmallVector<VPValue *, 2> Operands; 7078 unsigned NumIncoming = Phi->getNumIncomingValues(); 7079 for (unsigned In = 0; In < NumIncoming; In++) { 7080 VPValue *EdgeMask = 7081 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 7082 assert((EdgeMask || NumIncoming == 1) && 7083 "Multiple predecessors with one having a full mask"); 7084 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 7085 if (EdgeMask) 7086 Operands.push_back(EdgeMask); 7087 } 7088 return new VPBlendRecipe(Phi, Operands); 7089 } 7090 7091 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 7092 VPlan &Plan) const { 7093 7094 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7095 [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); }, 7096 Range); 7097 7098 if (IsPredicated) 7099 return nullptr; 7100 7101 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7102 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 7103 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 7104 return nullptr; 7105 7106 auto willWiden = [&](unsigned VF) -> bool { 7107 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7108 // The following case may be scalarized depending on the VF. 7109 // The flag shows whether we use Intrinsic or a usual Call for vectorized 7110 // version of the instruction. 7111 // Is it beneficial to perform intrinsic call compared to lib call? 7112 bool NeedToScalarize = false; 7113 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 7114 bool UseVectorIntrinsic = 7115 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 7116 return UseVectorIntrinsic || !NeedToScalarize; 7117 }; 7118 7119 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7120 return nullptr; 7121 7122 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 7123 } 7124 7125 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 7126 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 7127 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 7128 // Instruction should be widened, unless it is scalar after vectorization, 7129 // scalarization is profitable or it is predicated. 
7130 auto WillScalarize = [this, I](unsigned VF) -> bool { 7131 return CM.isScalarAfterVectorization(I, VF) || 7132 CM.isProfitableToScalarize(I, VF) || 7133 CM.isScalarWithPredication(I, VF); 7134 }; 7135 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 7136 Range); 7137 } 7138 7139 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 7140 auto IsVectorizableOpcode = [](unsigned Opcode) { 7141 switch (Opcode) { 7142 case Instruction::Add: 7143 case Instruction::And: 7144 case Instruction::AShr: 7145 case Instruction::BitCast: 7146 case Instruction::FAdd: 7147 case Instruction::FCmp: 7148 case Instruction::FDiv: 7149 case Instruction::FMul: 7150 case Instruction::FNeg: 7151 case Instruction::FPExt: 7152 case Instruction::FPToSI: 7153 case Instruction::FPToUI: 7154 case Instruction::FPTrunc: 7155 case Instruction::FRem: 7156 case Instruction::FSub: 7157 case Instruction::ICmp: 7158 case Instruction::IntToPtr: 7159 case Instruction::LShr: 7160 case Instruction::Mul: 7161 case Instruction::Or: 7162 case Instruction::PtrToInt: 7163 case Instruction::SDiv: 7164 case Instruction::Select: 7165 case Instruction::SExt: 7166 case Instruction::Shl: 7167 case Instruction::SIToFP: 7168 case Instruction::SRem: 7169 case Instruction::Sub: 7170 case Instruction::Trunc: 7171 case Instruction::UDiv: 7172 case Instruction::UIToFP: 7173 case Instruction::URem: 7174 case Instruction::Xor: 7175 case Instruction::ZExt: 7176 return true; 7177 } 7178 return false; 7179 }; 7180 7181 if (!IsVectorizableOpcode(I->getOpcode())) 7182 return nullptr; 7183 7184 // Success: widen this instruction. 7185 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 7186 } 7187 7188 VPBasicBlock *VPRecipeBuilder::handleReplication( 7189 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 7190 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 7191 VPlanPtr &Plan) { 7192 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 7193 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, 7194 Range); 7195 7196 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7197 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 7198 7199 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 7200 IsUniform, IsPredicated); 7201 setRecipe(I, Recipe); 7202 7203 // Find if I uses a predicated instruction. If so, it will use its scalar 7204 // value. Avoid hoisting the insert-element which packs the scalar value into 7205 // a vector value, as that happens iff all users use the vector value. 7206 for (auto &Op : I->operands()) 7207 if (auto *PredInst = dyn_cast<Instruction>(Op)) 7208 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 7209 PredInst2Recipe[PredInst]->setAlsoPack(false); 7210 7211 // Finalize the recipe for Instr, first if it is not predicated. 7212 if (!IsPredicated) { 7213 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 7214 VPBB->appendRecipe(Recipe); 7215 return VPBB; 7216 } 7217 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7218 assert(VPBB->getSuccessors().empty() && 7219 "VPBB has successors when handling predicated replication."); 7220 // Record predicated instructions for above packing optimizations. 
7221 PredInst2Recipe[I] = Recipe; 7222 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7223 VPBlockUtils::insertBlockAfter(Region, VPBB); 7224 auto *RegSucc = new VPBasicBlock(); 7225 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7226 return RegSucc; 7227 } 7228 7229 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7230 VPRecipeBase *PredRecipe, 7231 VPlanPtr &Plan) { 7232 // Instructions marked for predication are replicated and placed under an 7233 // if-then construct to prevent side-effects. 7234 7235 // Generate recipes to compute the block mask for this region. 7236 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 7237 7238 // Build the triangular if-then region. 7239 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 7240 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 7241 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 7242 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 7243 auto *PHIRecipe = 7244 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 7245 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 7246 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 7247 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 7248 7249 // Note: first set Entry as region entry and then connect successors starting 7250 // from it in order, to propagate the "parent" of each VPBasicBlock. 7251 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 7252 VPBlockUtils::connectBlocks(Pred, Exit); 7253 7254 return Region; 7255 } 7256 7257 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 7258 VFRange &Range, 7259 VPlanPtr &Plan) { 7260 // First, check for specific widening recipes that deal with calls, memory 7261 // operations, inductions and Phi nodes. 7262 if (auto *CI = dyn_cast<CallInst>(Instr)) 7263 return tryToWidenCall(CI, Range, *Plan); 7264 7265 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 7266 return tryToWidenMemory(Instr, Range, Plan); 7267 7268 VPRecipeBase *Recipe; 7269 if (auto Phi = dyn_cast<PHINode>(Instr)) { 7270 if (Phi->getParent() != OrigLoop->getHeader()) 7271 return tryToBlend(Phi, Plan); 7272 if ((Recipe = tryToOptimizeInductionPHI(Phi))) 7273 return Recipe; 7274 return new VPWidenPHIRecipe(Phi); 7275 } 7276 7277 if (isa<TruncInst>(Instr) && 7278 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range))) 7279 return Recipe; 7280 7281 if (!shouldWiden(Instr, Range)) 7282 return nullptr; 7283 7284 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 7285 return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()), 7286 OrigLoop); 7287 7288 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 7289 bool InvariantCond = 7290 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 7291 return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()), 7292 InvariantCond); 7293 } 7294 7295 return tryToWiden(Instr, *Plan); 7296 } 7297 7298 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 7299 unsigned MaxVF) { 7300 assert(OrigLoop->empty() && "Inner loop expected."); 7301 7302 // Collect conditions feeding internal conditional branches; they need to be 7303 // represented in VPlan for it to model masking. 
7304 SmallPtrSet<Value *, 1> NeedDef; 7305 7306 auto *Latch = OrigLoop->getLoopLatch(); 7307 for (BasicBlock *BB : OrigLoop->blocks()) { 7308 if (BB == Latch) 7309 continue; 7310 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); 7311 if (Branch && Branch->isConditional()) 7312 NeedDef.insert(Branch->getCondition()); 7313 } 7314 7315 // If the tail is to be folded by masking, the primary induction variable, if 7316 // exists needs to be represented in VPlan for it to model early-exit masking. 7317 // Also, both the Phi and the live-out instruction of each reduction are 7318 // required in order to introduce a select between them in VPlan. 7319 if (CM.foldTailByMasking()) { 7320 if (Legal->getPrimaryInduction()) 7321 NeedDef.insert(Legal->getPrimaryInduction()); 7322 for (auto &Reduction : Legal->getReductionVars()) { 7323 NeedDef.insert(Reduction.first); 7324 NeedDef.insert(Reduction.second.getLoopExitInstr()); 7325 } 7326 } 7327 7328 // Collect instructions from the original loop that will become trivially dead 7329 // in the vectorized loop. We don't need to vectorize these instructions. For 7330 // example, original induction update instructions can become dead because we 7331 // separately emit induction "steps" when generating code for the new loop. 7332 // Similarly, we create a new latch condition when setting up the structure 7333 // of the new loop, so the old one can become dead. 7334 SmallPtrSet<Instruction *, 4> DeadInstructions; 7335 collectTriviallyDeadInstructions(DeadInstructions); 7336 7337 // Add assume instructions we need to drop to DeadInstructions, to prevent 7338 // them from being added to the VPlan. 7339 // TODO: We only need to drop assumes in blocks that get flattend. If the 7340 // control flow is preserved, we should keep them. 7341 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 7342 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 7343 7344 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 7345 // Dead instructions do not need sinking. Remove them from SinkAfter. 7346 for (Instruction *I : DeadInstructions) 7347 SinkAfter.erase(I); 7348 7349 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 7350 VFRange SubRange = {VF, MaxVF + 1}; 7351 VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef, 7352 DeadInstructions, SinkAfter)); 7353 VF = SubRange.End; 7354 } 7355 } 7356 7357 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 7358 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, 7359 SmallPtrSetImpl<Instruction *> &DeadInstructions, 7360 const DenseMap<Instruction *, Instruction *> &SinkAfter) { 7361 7362 // Hold a mapping from predicated instructions to their recipes, in order to 7363 // fix their AlsoPack behavior if a user is determined to replicate and use a 7364 // scalar instead of vector value. 7365 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; 7366 7367 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 7368 7369 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 7370 7371 // --------------------------------------------------------------------------- 7372 // Pre-construction: record ingredients whose recipes we'll need to further 7373 // process after constructing the initial VPlan. 7374 // --------------------------------------------------------------------------- 7375 7376 // Mark instructions we'll need to sink later and their targets as 7377 // ingredients whose recipe we'll need to record. 
7378 for (auto &Entry : SinkAfter) { 7379 RecipeBuilder.recordRecipeOf(Entry.first); 7380 RecipeBuilder.recordRecipeOf(Entry.second); 7381 } 7382 7383 // For each interleave group which is relevant for this (possibly trimmed) 7384 // Range, add it to the set of groups to be later applied to the VPlan and add 7385 // placeholders for its members' Recipes which we'll be replacing with a 7386 // single VPInterleaveRecipe. 7387 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 7388 auto applyIG = [IG, this](unsigned VF) -> bool { 7389 return (VF >= 2 && // Query is illegal for VF == 1 7390 CM.getWideningDecision(IG->getInsertPos(), VF) == 7391 LoopVectorizationCostModel::CM_Interleave); 7392 }; 7393 if (!getDecisionAndClampRange(applyIG, Range)) 7394 continue; 7395 InterleaveGroups.insert(IG); 7396 for (unsigned i = 0; i < IG->getFactor(); i++) 7397 if (Instruction *Member = IG->getMember(i)) 7398 RecipeBuilder.recordRecipeOf(Member); 7399 }; 7400 7401 // --------------------------------------------------------------------------- 7402 // Build initial VPlan: Scan the body of the loop in a topological order to 7403 // visit each basic block after having visited its predecessor basic blocks. 7404 // --------------------------------------------------------------------------- 7405 7406 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 7407 auto Plan = std::make_unique<VPlan>(); 7408 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 7409 Plan->setEntry(VPBB); 7410 7411 // Represent values that will have defs inside VPlan. 7412 for (Value *V : NeedDef) 7413 Plan->addVPValue(V); 7414 7415 // Scan the body of the loop in a topological order to visit each basic block 7416 // after having visited its predecessor basic blocks. 7417 LoopBlocksDFS DFS(OrigLoop); 7418 DFS.perform(LI); 7419 7420 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 7421 // Relevant instructions from basic block BB will be grouped into VPRecipe 7422 // ingredients and fill a new VPBasicBlock. 7423 unsigned VPBBsForBB = 0; 7424 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 7425 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 7426 VPBB = FirstVPBBForBB; 7427 Builder.setInsertPoint(VPBB); 7428 7429 // Introduce each ingredient into VPlan. 7430 // TODO: Model and preserve debug instrinsics in VPlan. 7431 for (Instruction &I : BB->instructionsWithoutDebug()) { 7432 Instruction *Instr = &I; 7433 7434 // First filter out irrelevant instructions, to ensure no recipes are 7435 // built for them. 7436 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 7437 continue; 7438 7439 if (auto Recipe = 7440 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { 7441 RecipeBuilder.setRecipe(Instr, Recipe); 7442 VPBB->appendRecipe(Recipe); 7443 continue; 7444 } 7445 7446 // Otherwise, if all widening options failed, Instruction is to be 7447 // replicated. This may create a successor for VPBB. 7448 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7449 Instr, Range, VPBB, PredInst2Recipe, Plan); 7450 if (NextVPBB != VPBB) { 7451 VPBB = NextVPBB; 7452 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 7453 : ""); 7454 } 7455 } 7456 } 7457 7458 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7459 // may also be empty, such as the last one VPBB, reflecting original 7460 // basic-blocks with no recipes. 
7461 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7462 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7463 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7464 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7465 delete PreEntry; 7466 7467 // --------------------------------------------------------------------------- 7468 // Transform initial VPlan: Apply previously taken decisions, in order, to 7469 // bring the VPlan to its final state. 7470 // --------------------------------------------------------------------------- 7471 7472 // Apply Sink-After legal constraints. 7473 for (auto &Entry : SinkAfter) { 7474 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7475 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7476 Sink->moveAfter(Target); 7477 } 7478 7479 // Interleave memory: for each Interleave Group we marked earlier as relevant 7480 // for this VPlan, replace the Recipes widening its memory instructions with a 7481 // single VPInterleaveRecipe at its insertion point. 7482 for (auto IG : InterleaveGroups) { 7483 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7484 RecipeBuilder.getRecipe(IG->getInsertPos())); 7485 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) 7486 ->insertBefore(Recipe); 7487 7488 for (unsigned i = 0; i < IG->getFactor(); ++i) 7489 if (Instruction *Member = IG->getMember(i)) { 7490 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7491 } 7492 } 7493 7494 // Finally, if tail is folded by masking, introduce selects between the phi 7495 // and the live-out instruction of each reduction, at the end of the latch. 7496 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 7497 Builder.setInsertPoint(VPBB); 7498 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 7499 for (auto &Reduction : Legal->getReductionVars()) { 7500 VPValue *Phi = Plan->getVPValue(Reduction.first); 7501 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); 7502 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 7503 } 7504 } 7505 7506 std::string PlanName; 7507 raw_string_ostream RSO(PlanName); 7508 unsigned VF = Range.Start; 7509 Plan->addVF(VF); 7510 RSO << "Initial VPlan for VF={" << VF; 7511 for (VF *= 2; VF < Range.End; VF *= 2) { 7512 Plan->addVF(VF); 7513 RSO << "," << VF; 7514 } 7515 RSO << "},UF>=1"; 7516 RSO.flush(); 7517 Plan->setName(PlanName); 7518 7519 return Plan; 7520 } 7521 7522 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 7523 // Outer loop handling: They may require CFG and instruction level 7524 // transformations before even evaluating whether vectorization is profitable. 7525 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7526 // the vectorization pipeline. 7527 assert(!OrigLoop->empty()); 7528 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7529 7530 // Create new empty VPlan 7531 auto Plan = std::make_unique<VPlan>(); 7532 7533 // Build hierarchical CFG 7534 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 7535 HCFGBuilder.buildHierarchicalCFG(); 7536 7537 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) 7538 Plan->addVF(VF); 7539 7540 if (EnableVPlanPredication) { 7541 VPlanPredicator VPP(*Plan); 7542 VPP.predicate(); 7543 7544 // Avoid running transformation to recipes until masked code generation in 7545 // VPlan-native path is in place. 
7546 return Plan; 7547 } 7548 7549 SmallPtrSet<Instruction *, 1> DeadInstructions; 7550 VPlanTransforms::VPInstructionsToVPRecipes( 7551 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 7552 return Plan; 7553 } 7554 7555 Value* LoopVectorizationPlanner::VPCallbackILV:: 7556 getOrCreateVectorValues(Value *V, unsigned Part) { 7557 return ILV.getOrCreateVectorValue(V, Part); 7558 } 7559 7560 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 7561 Value *V, const VPIteration &Instance) { 7562 return ILV.getOrCreateScalarValue(V, Instance); 7563 } 7564 7565 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 7566 VPSlotTracker &SlotTracker) const { 7567 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7568 IG->getInsertPos()->printAsOperand(O, false); 7569 O << ", "; 7570 getAddr()->printAsOperand(O, SlotTracker); 7571 VPValue *Mask = getMask(); 7572 if (Mask) { 7573 O << ", "; 7574 Mask->printAsOperand(O, SlotTracker); 7575 } 7576 for (unsigned i = 0; i < IG->getFactor(); ++i) 7577 if (Instruction *I = IG->getMember(i)) 7578 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 7579 } 7580 7581 void VPWidenCallRecipe::execute(VPTransformState &State) { 7582 State.ILV->widenCallInstruction(Ingredient, User, State); 7583 } 7584 7585 void VPWidenSelectRecipe::execute(VPTransformState &State) { 7586 State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State); 7587 } 7588 7589 void VPWidenRecipe::execute(VPTransformState &State) { 7590 State.ILV->widenInstruction(Ingredient, User, State); 7591 } 7592 7593 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7594 State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant, 7595 IsIndexLoopInvariant, State); 7596 } 7597 7598 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7599 assert(!State.Instance && "Int or FP induction being replicated."); 7600 State.ILV->widenIntOrFpInduction(IV, Trunc); 7601 } 7602 7603 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7604 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7605 } 7606 7607 void VPBlendRecipe::execute(VPTransformState &State) { 7608 State.ILV->setDebugLocFromInst(State.Builder, Phi); 7609 // We know that all PHIs in non-header blocks are converted into 7610 // selects, so we don't have to worry about the insertion order and we 7611 // can just use the builder. 7612 // At this point we generate the predication tree. There may be 7613 // duplications since this is a simple recursive scan, but future 7614 // optimizations will clean it up. 7615 7616 unsigned NumIncoming = getNumIncomingValues(); 7617 7618 // Generate a sequence of selects of the form: 7619 // SELECT(Mask3, In3, 7620 // SELECT(Mask2, In2, 7621 // SELECT(Mask1, In1, 7622 // In0))) 7623 // Note that Mask0 is never used: lanes for which no path reaches this phi and 7624 // are essentially undef are taken from In0. 7625 InnerLoopVectorizer::VectorParts Entry(State.UF); 7626 for (unsigned In = 0; In < NumIncoming; ++In) { 7627 for (unsigned Part = 0; Part < State.UF; ++Part) { 7628 // We might have single edge PHIs (blocks) - use an identity 7629 // 'select' for the first PHI operand. 7630 Value *In0 = State.get(getIncomingValue(In), Part); 7631 if (In == 0) 7632 Entry[Part] = In0; // Initialize with the first incoming value. 7633 else { 7634 // Select between the current value and the previous incoming edge 7635 // based on the incoming mask. 
7636 Value *Cond = State.get(getMask(In), Part); 7637 Entry[Part] = 7638 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 7639 } 7640 } 7641 } 7642 for (unsigned Part = 0; Part < State.UF; ++Part) 7643 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 7644 } 7645 7646 void VPInterleaveRecipe::execute(VPTransformState &State) { 7647 assert(!State.Instance && "Interleave group being replicated."); 7648 State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask()); 7649 } 7650 7651 void VPReplicateRecipe::execute(VPTransformState &State) { 7652 if (State.Instance) { // Generate a single instance. 7653 State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance, 7654 IsPredicated, State); 7655 // Insert scalar instance packing it into a vector. 7656 if (AlsoPack && State.VF > 1) { 7657 // If we're constructing lane 0, initialize to start from undef. 7658 if (State.Instance->Lane == 0) { 7659 Value *Undef = UndefValue::get( 7660 FixedVectorType::get(Ingredient->getType(), State.VF)); 7661 State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); 7662 } 7663 State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance); 7664 } 7665 return; 7666 } 7667 7668 // Generate scalar instances for all VF lanes of all UF parts, unless the 7669 // instruction is uniform inwhich case generate only the first lane for each 7670 // of the UF parts. 7671 unsigned EndLane = IsUniform ? 1 : State.VF; 7672 for (unsigned Part = 0; Part < State.UF; ++Part) 7673 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 7674 State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane}, 7675 IsPredicated, State); 7676 } 7677 7678 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 7679 assert(State.Instance && "Branch on Mask works only on single instance."); 7680 7681 unsigned Part = State.Instance->Part; 7682 unsigned Lane = State.Instance->Lane; 7683 7684 Value *ConditionBit = nullptr; 7685 VPValue *BlockInMask = getMask(); 7686 if (BlockInMask) { 7687 ConditionBit = State.get(BlockInMask, Part); 7688 if (ConditionBit->getType()->isVectorTy()) 7689 ConditionBit = State.Builder.CreateExtractElement( 7690 ConditionBit, State.Builder.getInt32(Lane)); 7691 } else // Block in mask is all-one. 7692 ConditionBit = State.Builder.getTrue(); 7693 7694 // Replace the temporary unreachable terminator with a new conditional branch, 7695 // whose two destinations will be set later when they are created. 
7696 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 7697 assert(isa<UnreachableInst>(CurrentTerminator) && 7698 "Expected to replace unreachable terminator with conditional branch."); 7699 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 7700 CondBr->setSuccessor(0, nullptr); 7701 ReplaceInstWithInst(CurrentTerminator, CondBr); 7702 } 7703 7704 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 7705 assert(State.Instance && "Predicated instruction PHI works per instance."); 7706 Instruction *ScalarPredInst = cast<Instruction>( 7707 State.ValueMap.getScalarValue(PredInst, *State.Instance)); 7708 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 7709 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 7710 assert(PredicatingBB && "Predicated block has no single predecessor."); 7711 7712 // By current pack/unpack logic we need to generate only a single phi node: if 7713 // a vector value for the predicated instruction exists at this point it means 7714 // the instruction has vector users only, and a phi for the vector value is 7715 // needed. In this case the recipe of the predicated instruction is marked to 7716 // also do that packing, thereby "hoisting" the insert-element sequence. 7717 // Otherwise, a phi node for the scalar value is needed. 7718 unsigned Part = State.Instance->Part; 7719 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 7720 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 7721 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 7722 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 7723 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 7724 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 7725 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 7726 } else { 7727 Type *PredInstType = PredInst->getType(); 7728 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 7729 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 7730 Phi->addIncoming(ScalarPredInst, PredicatedBB); 7731 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 7732 } 7733 } 7734 7735 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 7736 VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr; 7737 State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue, 7738 getMask()); 7739 } 7740 7741 // Determine how to lower the scalar epilogue, which depends on 1) optimising 7742 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 7743 // predication, and 4) a TTI hook that analyses whether the loop is suitable 7744 // for predication. 7745 static ScalarEpilogueLowering getScalarEpilogueLowering( 7746 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 7747 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 7748 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 7749 LoopVectorizationLegality &LVL) { 7750 bool OptSize = 7751 F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 7752 PGSOQueryType::IRPass); 7753 // 1) OptSize takes precedence over all other options, i.e. if this is set, 7754 // don't look at hints or options, and don't request a scalar epilogue. 
7755 if (OptSize) 7756 return CM_ScalarEpilogueNotAllowedOptSize; 7757 7758 bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && 7759 !PreferPredicateOverEpilog; 7760 7761 // 2) Next, if disabling predication is requested on the command line, honour 7762 // this and request a scalar epilogue. 7763 if (PredicateOptDisabled) 7764 return CM_ScalarEpilogueAllowed; 7765 7766 // 3) and 4) look if enabling predication is requested on the command line, 7767 // with a loop hint, or if the TTI hook indicates this is profitable, request 7768 // predication . 7769 if (PreferPredicateOverEpilog || 7770 Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || 7771 (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 7772 LVL.getLAI()) && 7773 Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) 7774 return CM_ScalarEpilogueNotNeededUsePredicate; 7775 7776 return CM_ScalarEpilogueAllowed; 7777 } 7778 7779 // Process the loop in the VPlan-native vectorization path. This path builds 7780 // VPlan upfront in the vectorization pipeline, which allows to apply 7781 // VPlan-to-VPlan transformations from the very beginning without modifying the 7782 // input LLVM IR. 7783 static bool processLoopInVPlanNativePath( 7784 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 7785 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 7786 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 7787 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 7788 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 7789 7790 if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) { 7791 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 7792 return false; 7793 } 7794 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 7795 Function *F = L->getHeader()->getParent(); 7796 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 7797 7798 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7799 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 7800 7801 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 7802 &Hints, IAI); 7803 // Use the planner for outer loop vectorization. 7804 // TODO: CM is not used at this point inside the planner. Turn CM into an 7805 // optional argument if we don't need it in the future. 7806 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 7807 7808 // Get user vectorization factor. 7809 const unsigned UserVF = Hints.getWidth(); 7810 7811 // Plan how to best vectorize, return the best VF and its cost. 7812 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 7813 7814 // If we are stress testing VPlan builds, do not attempt to generate vector 7815 // code. Masked vector code generation support will follow soon. 7816 // Also, do not attempt to vectorize if no vector code will be produced. 7817 if (VPlanBuildStressTest || EnableVPlanPredication || 7818 VectorizationFactor::Disabled() == VF) 7819 return false; 7820 7821 LVP.setBestPlan(VF.Width, 1); 7822 7823 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 7824 &CM, BFI, PSI); 7825 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 7826 << L->getHeader()->getParent()->getName() << "\"\n"); 7827 LVP.executePlan(LB, DT); 7828 7829 // Mark the loop as already vectorized to avoid vectorizing again. 
7830 Hints.setAlreadyVectorized(); 7831 7832 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 7833 return true; 7834 } 7835 7836 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 7837 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 7838 !EnableLoopInterleaving), 7839 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 7840 !EnableLoopVectorization) {} 7841 7842 bool LoopVectorizePass::processLoop(Loop *L) { 7843 assert((EnableVPlanNativePath || L->empty()) && 7844 "VPlan-native path is not enabled. Only process inner loops."); 7845 7846 #ifndef NDEBUG 7847 const std::string DebugLocStr = getDebugLocString(L); 7848 #endif /* NDEBUG */ 7849 7850 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 7851 << L->getHeader()->getParent()->getName() << "\" from " 7852 << DebugLocStr << "\n"); 7853 7854 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 7855 7856 LLVM_DEBUG( 7857 dbgs() << "LV: Loop hints:" 7858 << " force=" 7859 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 7860 ? "disabled" 7861 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 7862 ? "enabled" 7863 : "?")) 7864 << " width=" << Hints.getWidth() 7865 << " unroll=" << Hints.getInterleave() << "\n"); 7866 7867 // Function containing loop 7868 Function *F = L->getHeader()->getParent(); 7869 7870 // Looking at the diagnostic output is the only way to determine if a loop 7871 // was vectorized (other than looking at the IR or machine code), so it 7872 // is important to generate an optimization remark for each loop. Most of 7873 // these messages are generated as OptimizationRemarkAnalysis. Remarks 7874 // generated as OptimizationRemark and OptimizationRemarkMissed are 7875 // less verbose reporting vectorized loops and unvectorized loops that may 7876 // benefit from vectorization, respectively. 7877 7878 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 7879 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 7880 return false; 7881 } 7882 7883 PredicatedScalarEvolution PSE(*SE, *L); 7884 7885 // Check if it is legal to vectorize the loop. 7886 LoopVectorizationRequirements Requirements(*ORE); 7887 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 7888 &Requirements, &Hints, DB, AC, BFI, PSI); 7889 if (!LVL.canVectorize(EnableVPlanNativePath)) { 7890 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 7891 Hints.emitRemarkWithHints(); 7892 return false; 7893 } 7894 7895 // Check the function attributes and profiles to find out if this function 7896 // should be optimized for size. 7897 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7898 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 7899 7900 // Entrance to the VPlan-native vectorization path. Outer loops are processed 7901 // here. They may require CFG and instruction level transformations before 7902 // even evaluating whether vectorization is profitable. Since we cannot modify 7903 // the incoming IR, we need to build VPlan upfront in the vectorization 7904 // pipeline. 7905 if (!L->empty()) 7906 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 7907 ORE, BFI, PSI, Hints); 7908 7909 assert(L->empty() && "Inner loop expected."); 7910 7911 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 7912 // count by optimizing for size, to minimize overheads. 
7913 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 7914 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 7915 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 7916 << "This loop is worth vectorizing only if no scalar " 7917 << "iteration overheads are incurred."); 7918 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 7919 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 7920 else { 7921 LLVM_DEBUG(dbgs() << "\n"); 7922 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 7923 } 7924 } 7925 7926 // Check the function attributes to see if implicit floats are allowed. 7927 // FIXME: This check doesn't seem possibly correct -- what if the loop is 7928 // an integer loop and the vector instructions selected are purely integer 7929 // vector instructions? 7930 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 7931 reportVectorizationFailure( 7932 "Can't vectorize when the NoImplicitFloat attribute is used", 7933 "loop not vectorized due to NoImplicitFloat attribute", 7934 "NoImplicitFloat", ORE, L); 7935 Hints.emitRemarkWithHints(); 7936 return false; 7937 } 7938 7939 // Check if the target supports potentially unsafe FP vectorization. 7940 // FIXME: Add a check for the type of safety issue (denormal, signaling) 7941 // for the target we're vectorizing for, to make sure none of the 7942 // additional fp-math flags can help. 7943 if (Hints.isPotentiallyUnsafe() && 7944 TTI->isFPVectorizationPotentiallyUnsafe()) { 7945 reportVectorizationFailure( 7946 "Potentially unsafe FP op prevents vectorization", 7947 "loop not vectorized due to unsafe FP support.", 7948 "UnsafeFP", ORE, L); 7949 Hints.emitRemarkWithHints(); 7950 return false; 7951 } 7952 7953 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 7954 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 7955 7956 // If an override option has been passed in for interleaved accesses, use it. 7957 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 7958 UseInterleaved = EnableInterleavedMemAccesses; 7959 7960 // Analyze interleaved memory accesses. 7961 if (UseInterleaved) { 7962 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 7963 } 7964 7965 // Use the cost model. 7966 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 7967 F, &Hints, IAI); 7968 CM.collectValuesToIgnore(); 7969 7970 // Use the planner for vectorization. 7971 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); 7972 7973 // Get user vectorization factor and interleave count. 7974 unsigned UserVF = Hints.getWidth(); 7975 unsigned UserIC = Hints.getInterleave(); 7976 7977 // Plan how to best vectorize, return the best VF and its cost. 7978 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 7979 7980 VectorizationFactor VF = VectorizationFactor::Disabled(); 7981 unsigned IC = 1; 7982 7983 if (MaybeVF) { 7984 VF = *MaybeVF; 7985 // Select the interleave count. 7986 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 7987 } 7988 7989 // Identify the diagnostic messages that should be produced. 
7990 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 7991 bool VectorizeLoop = true, InterleaveLoop = true; 7992 if (Requirements.doesNotMeet(F, L, Hints)) { 7993 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " 7994 "requirements.\n"); 7995 Hints.emitRemarkWithHints(); 7996 return false; 7997 } 7998 7999 if (VF.Width == 1) { 8000 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 8001 VecDiagMsg = std::make_pair( 8002 "VectorizationNotBeneficial", 8003 "the cost-model indicates that vectorization is not beneficial"); 8004 VectorizeLoop = false; 8005 } 8006 8007 if (!MaybeVF && UserIC > 1) { 8008 // Tell the user interleaving was avoided up-front, despite being explicitly 8009 // requested. 8010 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 8011 "interleaving should be avoided up front\n"); 8012 IntDiagMsg = std::make_pair( 8013 "InterleavingAvoided", 8014 "Ignoring UserIC, because interleaving was avoided up front"); 8015 InterleaveLoop = false; 8016 } else if (IC == 1 && UserIC <= 1) { 8017 // Tell the user interleaving is not beneficial. 8018 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 8019 IntDiagMsg = std::make_pair( 8020 "InterleavingNotBeneficial", 8021 "the cost-model indicates that interleaving is not beneficial"); 8022 InterleaveLoop = false; 8023 if (UserIC == 1) { 8024 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; 8025 IntDiagMsg.second += 8026 " and is explicitly disabled or interleave count is set to 1"; 8027 } 8028 } else if (IC > 1 && UserIC == 1) { 8029 // Tell the user interleaving is beneficial, but it explicitly disabled. 8030 LLVM_DEBUG( 8031 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); 8032 IntDiagMsg = std::make_pair( 8033 "InterleavingBeneficialButDisabled", 8034 "the cost-model indicates that interleaving is beneficial " 8035 "but is explicitly disabled or interleave count is set to 1"); 8036 InterleaveLoop = false; 8037 } 8038 8039 // Override IC if user provided an interleave count. 8040 IC = UserIC > 0 ? UserIC : IC; 8041 8042 // Emit diagnostic messages, if any. 8043 const char *VAPassName = Hints.vectorizeAnalysisPassName(); 8044 if (!VectorizeLoop && !InterleaveLoop) { 8045 // Do not vectorize or interleaving the loop. 
8046 ORE->emit([&]() { 8047 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 8048 L->getStartLoc(), L->getHeader()) 8049 << VecDiagMsg.second; 8050 }); 8051 ORE->emit([&]() { 8052 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 8053 L->getStartLoc(), L->getHeader()) 8054 << IntDiagMsg.second; 8055 }); 8056 return false; 8057 } else if (!VectorizeLoop && InterleaveLoop) { 8058 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 8059 ORE->emit([&]() { 8060 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 8061 L->getStartLoc(), L->getHeader()) 8062 << VecDiagMsg.second; 8063 }); 8064 } else if (VectorizeLoop && !InterleaveLoop) { 8065 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 8066 << ") in " << DebugLocStr << '\n'); 8067 ORE->emit([&]() { 8068 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 8069 L->getStartLoc(), L->getHeader()) 8070 << IntDiagMsg.second; 8071 }); 8072 } else if (VectorizeLoop && InterleaveLoop) { 8073 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 8074 << ") in " << DebugLocStr << '\n'); 8075 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 8076 } 8077 8078 LVP.setBestPlan(VF.Width, IC); 8079 8080 using namespace ore; 8081 bool DisableRuntimeUnroll = false; 8082 MDNode *OrigLoopID = L->getLoopID(); 8083 8084 if (!VectorizeLoop) { 8085 assert(IC > 1 && "interleave count should not be 1 or 0"); 8086 // If we decided that it is not legal to vectorize the loop, then 8087 // interleave it. 8088 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, 8089 BFI, PSI); 8090 LVP.executePlan(Unroller, DT); 8091 8092 ORE->emit([&]() { 8093 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 8094 L->getHeader()) 8095 << "interleaved loop (interleaved count: " 8096 << NV("InterleaveCount", IC) << ")"; 8097 }); 8098 } else { 8099 // If we decided that it is *legal* to vectorize the loop, then do it. 8100 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 8101 &LVL, &CM, BFI, PSI); 8102 LVP.executePlan(LB, DT); 8103 ++LoopsVectorized; 8104 8105 // Add metadata to disable runtime unrolling a scalar loop when there are 8106 // no runtime checks about strides and memory. A scalar loop that is 8107 // rarely used is not worth unrolling. 8108 if (!LB.areSafetyChecksAdded()) 8109 DisableRuntimeUnroll = true; 8110 8111 // Report the vectorization decision. 8112 ORE->emit([&]() { 8113 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 8114 L->getHeader()) 8115 << "vectorized loop (vectorization width: " 8116 << NV("VectorizationFactor", VF.Width) 8117 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 8118 }); 8119 } 8120 8121 Optional<MDNode *> RemainderLoopID = 8122 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 8123 LLVMLoopVectorizeFollowupEpilogue}); 8124 if (RemainderLoopID.hasValue()) { 8125 L->setLoopID(RemainderLoopID.getValue()); 8126 } else { 8127 if (DisableRuntimeUnroll) 8128 AddRuntimeUnrollDisableMetaData(L); 8129 8130 // Mark the loop as already vectorized to avoid vectorizing again. 
8131 Hints.setAlreadyVectorized(); 8132 } 8133 8134 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 8135 return true; 8136 } 8137 8138 LoopVectorizeResult LoopVectorizePass::runImpl( 8139 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 8140 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 8141 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 8142 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 8143 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 8144 SE = &SE_; 8145 LI = &LI_; 8146 TTI = &TTI_; 8147 DT = &DT_; 8148 BFI = &BFI_; 8149 TLI = TLI_; 8150 AA = &AA_; 8151 AC = &AC_; 8152 GetLAA = &GetLAA_; 8153 DB = &DB_; 8154 ORE = &ORE_; 8155 PSI = PSI_; 8156 8157 // Don't attempt if 8158 // 1. the target claims to have no vector registers, and 8159 // 2. interleaving won't help ILP. 8160 // 8161 // The second condition is necessary because, even if the target has no 8162 // vector registers, loop vectorization may still enable scalar 8163 // interleaving. 8164 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 8165 TTI->getMaxInterleaveFactor(1) < 2) 8166 return LoopVectorizeResult(false, false); 8167 8168 bool Changed = false, CFGChanged = false; 8169 8170 // The vectorizer requires loops to be in simplified form. 8171 // Since simplification may add new inner loops, it has to run before the 8172 // legality and profitability checks. This means running the loop vectorizer 8173 // will simplify all loops, regardless of whether anything end up being 8174 // vectorized. 8175 for (auto &L : *LI) 8176 Changed |= CFGChanged |= 8177 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 8178 8179 // Build up a worklist of inner-loops to vectorize. This is necessary as 8180 // the act of vectorizing or partially unrolling a loop creates new loops 8181 // and can invalidate iterators across the loops. 8182 SmallVector<Loop *, 8> Worklist; 8183 8184 for (Loop *L : *LI) 8185 collectSupportedLoops(*L, LI, ORE, Worklist); 8186 8187 LoopsAnalyzed += Worklist.size(); 8188 8189 // Now walk the identified inner loops. 8190 while (!Worklist.empty()) { 8191 Loop *L = Worklist.pop_back_val(); 8192 8193 // For the inner loops we actually process, form LCSSA to simplify the 8194 // transform. 8195 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 8196 8197 Changed |= CFGChanged |= processLoop(L); 8198 } 8199 8200 // Process each loop nest in the function. 8201 return LoopVectorizeResult(Changed, CFGChanged); 8202 } 8203 8204 PreservedAnalyses LoopVectorizePass::run(Function &F, 8205 FunctionAnalysisManager &AM) { 8206 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 8207 auto &LI = AM.getResult<LoopAnalysis>(F); 8208 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 8209 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 8210 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 8211 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 8212 auto &AA = AM.getResult<AAManager>(F); 8213 auto &AC = AM.getResult<AssumptionAnalysis>(F); 8214 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 8215 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 8216 MemorySSA *MSSA = EnableMSSALoopDependency 8217 ? 
&AM.getResult<MemorySSAAnalysis>(F).getMSSA() 8218 : nullptr; 8219 8220 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 8221 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 8222 [&](Loop &L) -> const LoopAccessInfo & { 8223 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; 8224 return LAM.getResult<LoopAccessAnalysis>(L, AR); 8225 }; 8226 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 8227 ProfileSummaryInfo *PSI = 8228 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 8229 LoopVectorizeResult Result = 8230 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 8231 if (!Result.MadeAnyChange) 8232 return PreservedAnalyses::all(); 8233 PreservedAnalyses PA; 8234 8235 // We currently do not preserve loopinfo/dominator analyses with outer loop 8236 // vectorization. Until this is addressed, mark these analyses as preserved 8237 // only for non-VPlan-native path. 8238 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 8239 if (!EnableVPlanNativePath) { 8240 PA.preserve<LoopAnalysis>(); 8241 PA.preserve<DominatorTreeAnalysis>(); 8242 } 8243 PA.preserve<BasicAA>(); 8244 PA.preserve<GlobalsAA>(); 8245 if (!Result.MadeCFGChange) 8246 PA.preserveSet<CFGAnalyses>(); 8247 return PA; 8248 } 8249