//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired and that predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
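/// For example, a group that accesses A[3*i] and A[3*i+1] but not A[3*i+2]
/// has a gap: a full-width interleaved load would also touch the unused
/// member, so the group needs either a mask or a scalar epilogue iteration.
/// (This is an illustrative sketch, not a reference to a specific test case.)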
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses "
             "in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of a loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
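///
/// A rough sketch of the expected driver flow (see LoopVectorizationPlanner):
/// createVectorizedLoopSkeleton() first sets up the new loop structure, the
/// widened instructions for the loop body are then emitted, and finally
/// fixVectorizedLoop() patches up header phis and live-outs. This describes
/// the intended call order, not a contract enforced by the class itself.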
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
                bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive.
  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
                            bool IfPredicateInstr);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize the interleaved access group that \p Instr belongs to
  /// with the base address given in \p Addr, optionally masking the vector
  /// operations if \p BlockInMask is non-null. Use \p State to translate given
  /// VPValues to IR values in the vectorized loop.
  void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
                                VPValue *Addr, VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr, VPValue *StoredValue,
                                  VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The loop exit block may have single-value PHI nodes with some
  /// incoming value. While vectorizing, we only handled real values
  /// that were defined inside the loop, and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at \p StartIdx.
  /// \p Opcode is relevant for FP induction variables.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AliasAnalysis *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B,
                                              const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
                          << DIL->getFilename() << " Line: "
                          << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
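///
/// In rough terms, a client is expected to call computeMaxVF() first, then
/// selectVectorizationFactor() to pick a VF up to that bound, and finally
/// selectInterleaveCount() for the chosen VF and its cost. This describes the
/// usual query order rather than an enforced API contract.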
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF();

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Set up cost-based decisions for the user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If the interleave count has been specified by metadata it will be
  /// returned. Otherwise, the interleave count is computed and returned. VF
  /// and LoopCost are the selected vectorization factor and the cost of the
  /// selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way. The
  /// form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Information about the register usage of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for a memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return true if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on the CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports a masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports a masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports a masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports a masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    MaybeAlign Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. it is not forbidden
  /// due to optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the loop tail.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or is too
  /// expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate the vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for a scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

  /// The cost computation for an interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);

  /// The cost computation for Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, unsigned VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);

  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
1413 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1414 1415 /// Returns the expected difference in cost from scalarizing the expression 1416 /// feeding a predicated instruction \p PredInst. The instructions to 1417 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1418 /// non-negative return value implies the expression will be scalarized. 1419 /// Currently, only single-use chains are considered for scalarization. 1420 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1421 unsigned VF); 1422 1423 /// Collect the instructions that are uniform after vectorization. An 1424 /// instruction is uniform if we represent it with a single scalar value in 1425 /// the vectorized loop corresponding to each vector iteration. Examples of 1426 /// uniform instructions include pointer operands of consecutive or 1427 /// interleaved memory accesses. Note that although uniformity implies an 1428 /// instruction will be scalar, the reverse is not true. In general, a 1429 /// scalarized instruction will be represented by VF scalar values in the 1430 /// vectorized loop, each corresponding to an iteration of the original 1431 /// scalar loop. 1432 void collectLoopUniforms(unsigned VF); 1433 1434 /// Collect the instructions that are scalar after vectorization. An 1435 /// instruction is scalar if it is known to be uniform or will be scalarized 1436 /// during vectorization. Non-uniform scalarized instructions will be 1437 /// represented by VF values in the vectorized loop, each corresponding to an 1438 /// iteration of the original scalar loop. 1439 void collectLoopScalars(unsigned VF); 1440 1441 /// Keeps cost model vectorization decision and cost for instructions. 1442 /// Right now it is used for memory instructions only. 1443 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, 1444 std::pair<InstWidening, unsigned>>; 1445 1446 DecisionList WideningDecisions; 1447 1448 /// Returns true if \p V is expected to be vectorized and it needs to be 1449 /// extracted. 1450 bool needsExtract(Value *V, unsigned VF) const { 1451 Instruction *I = dyn_cast<Instruction>(V); 1452 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) 1453 return false; 1454 1455 // Assume we can vectorize V (and hence we need extraction) if the 1456 // scalars are not computed yet. This can happen, because it is called 1457 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1458 // the scalars are collected. That should be a safe assumption in most 1459 // cases, because we check if the operands have vectorizable types 1460 // beforehand in LoopVectorizationLegality. 1461 return Scalars.find(VF) == Scalars.end() || 1462 !isScalarAfterVectorization(I, VF); 1463 }; 1464 1465 /// Returns a range containing only operands needing to be extracted. 1466 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1467 unsigned VF) { 1468 return SmallVector<Value *, 4>(make_filter_range( 1469 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1470 } 1471 1472 public: 1473 /// The loop that we evaluate. 1474 Loop *TheLoop; 1475 1476 /// Predicated scalar evolution analysis. 1477 PredicatedScalarEvolution &PSE; 1478 1479 /// Loop Info analysis. 1480 LoopInfo *LI; 1481 1482 /// Vectorization legality. 1483 LoopVectorizationLegality *Legal; 1484 1485 /// Vector target information. 1486 const TargetTransformInfo &TTI; 1487 1488 /// Target Library Info. 
1489 const TargetLibraryInfo *TLI; 1490 1491 /// Demanded bits analysis. 1492 DemandedBits *DB; 1493 1494 /// Assumption cache. 1495 AssumptionCache *AC; 1496 1497 /// Interface to emit optimization remarks. 1498 OptimizationRemarkEmitter *ORE; 1499 1500 const Function *TheFunction; 1501 1502 /// Loop Vectorize Hint. 1503 const LoopVectorizeHints *Hints; 1504 1505 /// The interleave access information contains groups of interleaved accesses 1506 /// with the same stride and close to each other. 1507 InterleavedAccessInfo &InterleaveInfo; 1508 1509 /// Values to ignore in the cost model. 1510 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1511 1512 /// Values to ignore in the cost model when VF > 1. 1513 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1514 }; 1515 1516 } // end namespace llvm 1517 1518 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1519 // vectorization. The loop needs to be annotated with #pragma omp simd 1520 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1521 // vector length information is not provided, vectorization is not considered 1522 // explicit. Interleave hints are not allowed either. These limitations will be 1523 // relaxed in the future. 1524 // Please, note that we are currently forced to abuse the pragma 'clang 1525 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1526 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1527 // provides *explicit vectorization hints* (LV can bypass legal checks and 1528 // assume that vectorization is legal). However, both hints are implemented 1529 // using the same metadata (llvm.loop.vectorize, processed by 1530 // LoopVectorizeHints). This will be fixed in the future when the native IR 1531 // representation for pragma 'omp simd' is introduced. 1532 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1533 OptimizationRemarkEmitter *ORE) { 1534 assert(!OuterLp->empty() && "This is not an outer loop"); 1535 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1536 1537 // Only outer loops with an explicit vectorization hint are supported. 1538 // Unannotated outer loops are ignored. 1539 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1540 return false; 1541 1542 Function *Fn = OuterLp->getHeader()->getParent(); 1543 if (!Hints.allowVectorization(Fn, OuterLp, 1544 true /*VectorizeOnlyWhenForced*/)) { 1545 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1546 return false; 1547 } 1548 1549 if (Hints.getInterleave() > 1) { 1550 // TODO: Interleave support is future work. 1551 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1552 "outer loops.\n"); 1553 Hints.emitRemarkWithHints(); 1554 return false; 1555 } 1556 1557 return true; 1558 } 1559 1560 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1561 OptimizationRemarkEmitter *ORE, 1562 SmallVectorImpl<Loop *> &V) { 1563 // Collect inner loops and outer loops without irreducible control flow. For 1564 // now, only collect outer loops that have explicit vectorization hints. If we 1565 // are stress testing the VPlan H-CFG construction, we collect the outermost 1566 // loop of every loop nest. 
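  // Illustrative sketch (hypothetical nest, not taken from this file): given
  //   for (i) { for (j) { for (k) { ... } } }   // L1 > L2 > L3
  // the default path recurses down and collects only the innermost loop L3,
  // whereas with VPlanBuildStressTest, or with EnableVPlanNativePath and an
  // explicit vectorization hint on L1, the outermost loop L1 is collected and
  // its inner loops are not visited (subject to the irreducible-CFG check).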
1567 if (L.empty() || VPlanBuildStressTest || 1568 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1569 LoopBlocksRPO RPOT(&L); 1570 RPOT.perform(LI); 1571 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1572 V.push_back(&L); 1573 // TODO: Collect inner loops inside marked outer loops in case 1574 // vectorization fails for the outer loop. Do not invoke 1575 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1576 // already known to be reducible. We can use an inherited attribute for 1577 // that. 1578 return; 1579 } 1580 } 1581 for (Loop *InnerL : L) 1582 collectSupportedLoops(*InnerL, LI, ORE, V); 1583 } 1584 1585 namespace { 1586 1587 /// The LoopVectorize Pass. 1588 struct LoopVectorize : public FunctionPass { 1589 /// Pass identification, replacement for typeid 1590 static char ID; 1591 1592 LoopVectorizePass Impl; 1593 1594 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1595 bool VectorizeOnlyWhenForced = false) 1596 : FunctionPass(ID) { 1597 Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced; 1598 Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced; 1599 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1600 } 1601 1602 bool runOnFunction(Function &F) override { 1603 if (skipFunction(F)) 1604 return false; 1605 1606 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1607 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1608 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1609 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1610 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1611 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1612 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1613 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1614 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1615 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1616 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1617 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1618 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1619 1620 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1621 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1622 1623 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1624 GetLAA, *ORE, PSI); 1625 } 1626 1627 void getAnalysisUsage(AnalysisUsage &AU) const override { 1628 AU.addRequired<AssumptionCacheTracker>(); 1629 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1630 AU.addRequired<DominatorTreeWrapperPass>(); 1631 AU.addRequired<LoopInfoWrapperPass>(); 1632 AU.addRequired<ScalarEvolutionWrapperPass>(); 1633 AU.addRequired<TargetTransformInfoWrapperPass>(); 1634 AU.addRequired<AAResultsWrapperPass>(); 1635 AU.addRequired<LoopAccessLegacyAnalysis>(); 1636 AU.addRequired<DemandedBitsWrapperPass>(); 1637 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1638 AU.addRequired<InjectTLIMappingsLegacy>(); 1639 1640 // We currently do not preserve loopinfo/dominator analyses with outer loop 1641 // vectorization. Until this is addressed, mark these analyses as preserved 1642 // only for non-VPlan-native path. 1643 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
1644 if (!EnableVPlanNativePath) { 1645 AU.addPreserved<LoopInfoWrapperPass>(); 1646 AU.addPreserved<DominatorTreeWrapperPass>(); 1647 } 1648 1649 AU.addPreserved<BasicAAWrapperPass>(); 1650 AU.addPreserved<GlobalsAAWrapperPass>(); 1651 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1652 } 1653 }; 1654 1655 } // end anonymous namespace 1656 1657 //===----------------------------------------------------------------------===// 1658 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1659 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1660 //===----------------------------------------------------------------------===// 1661 1662 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1663 // We need to place the broadcast of invariant variables outside the loop, 1664 // but only if it's proven safe to do so. Else, broadcast will be inside 1665 // vector loop body. 1666 Instruction *Instr = dyn_cast<Instruction>(V); 1667 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1668 (!Instr || 1669 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1670 // Place the code for broadcasting invariant variables in the new preheader. 1671 IRBuilder<>::InsertPointGuard Guard(Builder); 1672 if (SafeToHoist) 1673 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1674 1675 // Broadcast the scalar into all locations in the vector. 1676 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1677 1678 return Shuf; 1679 } 1680 1681 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1682 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1683 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1684 "Expected either an induction phi-node or a truncate of it!"); 1685 Value *Start = II.getStartValue(); 1686 1687 // Construct the initial value of the vector IV in the vector loop preheader 1688 auto CurrIP = Builder.saveIP(); 1689 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1690 if (isa<TruncInst>(EntryVal)) { 1691 assert(Start->getType()->isIntegerTy() && 1692 "Truncation requires an integer type"); 1693 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1694 Step = Builder.CreateTrunc(Step, TruncType); 1695 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1696 } 1697 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1698 Value *SteppedStart = 1699 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1700 1701 // We create vector phi nodes for both integer and floating-point induction 1702 // variables. Here, we determine the kind of arithmetic we will perform. 1703 Instruction::BinaryOps AddOp; 1704 Instruction::BinaryOps MulOp; 1705 if (Step->getType()->isIntegerTy()) { 1706 AddOp = Instruction::Add; 1707 MulOp = Instruction::Mul; 1708 } else { 1709 AddOp = II.getInductionOpcode(); 1710 MulOp = Instruction::FMul; 1711 } 1712 1713 // Multiply the vectorization factor by the step using integer or 1714 // floating-point arithmetic as appropriate. 1715 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); 1716 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1717 1718 // Create a vector splat to use in the induction update. 1719 // 1720 // FIXME: If the step is non-constant, we create the vector splat with 1721 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1722 // handle a constant vector splat. 1723 Value *SplatVF = 1724 isa<Constant>(Mul) 1725 ? 
ConstantVector::getSplat({VF, false}, cast<Constant>(Mul)) 1726 : Builder.CreateVectorSplat(VF, Mul); 1727 Builder.restoreIP(CurrIP); 1728 1729 // We may need to add the step a number of times, depending on the unroll 1730 // factor. The last of those goes into the PHI. 1731 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1732 &*LoopVectorBody->getFirstInsertionPt()); 1733 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1734 Instruction *LastInduction = VecInd; 1735 for (unsigned Part = 0; Part < UF; ++Part) { 1736 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1737 1738 if (isa<TruncInst>(EntryVal)) 1739 addMetadata(LastInduction, EntryVal); 1740 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1741 1742 LastInduction = cast<Instruction>(addFastMathFlag( 1743 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1744 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1745 } 1746 1747 // Move the last step to the end of the latch block. This ensures consistent 1748 // placement of all induction updates. 1749 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1750 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1751 auto *ICmp = cast<Instruction>(Br->getCondition()); 1752 LastInduction->moveBefore(ICmp); 1753 LastInduction->setName("vec.ind.next"); 1754 1755 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1756 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1757 } 1758 1759 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1760 return Cost->isScalarAfterVectorization(I, VF) || 1761 Cost->isProfitableToScalarize(I, VF); 1762 } 1763 1764 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1765 if (shouldScalarizeInstruction(IV)) 1766 return true; 1767 auto isScalarInst = [&](User *U) -> bool { 1768 auto *I = cast<Instruction>(U); 1769 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1770 }; 1771 return llvm::any_of(IV->users(), isScalarInst); 1772 } 1773 1774 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1775 const InductionDescriptor &ID, const Instruction *EntryVal, 1776 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1777 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1778 "Expected either an induction phi-node or a truncate of it!"); 1779 1780 // This induction variable is not the phi from the original loop but the 1781 // newly-created IV based on the proof that casted Phi is equal to the 1782 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1783 // re-uses the same InductionDescriptor that original IV uses but we don't 1784 // have to do any recording in this case - that is done when original IV is 1785 // processed. 1786 if (isa<TruncInst>(EntryVal)) 1787 return; 1788 1789 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1790 if (Casts.empty()) 1791 return; 1792 // Only the first Cast instruction in the Casts vector is of interest. 1793 // The rest of the Casts (if exist) have no uses outside the 1794 // induction update chain itself. 
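  // Illustrative example (hypothetical IR): if the induction descriptor
  // recorded a cast such as
  //   %iv.cast = zext/sext/trunc ... %iv ...
  // that was proven equal to %iv under a runtime predicate, then the value
  // created for the widened induction is recorded below for %iv.cast as well,
  // so users of the cast pick up the widened value directly.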
1795 Instruction *CastInst = *Casts.begin(); 1796 if (Lane < UINT_MAX) 1797 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1798 else 1799 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1800 } 1801 1802 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1803 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1804 "Primary induction variable must have an integer type"); 1805 1806 auto II = Legal->getInductionVars().find(IV); 1807 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 1808 1809 auto ID = II->second; 1810 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1811 1812 // The value from the original loop to which we are mapping the new induction 1813 // variable. 1814 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1815 1816 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1817 1818 // Generate code for the induction step. Note that induction steps are 1819 // required to be loop-invariant 1820 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 1821 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 1822 "Induction step should be loop invariant"); 1823 if (PSE.getSE()->isSCEVable(IV->getType())) { 1824 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1825 return Exp.expandCodeFor(Step, Step->getType(), 1826 LoopVectorPreHeader->getTerminator()); 1827 } 1828 return cast<SCEVUnknown>(Step)->getValue(); 1829 }; 1830 1831 // The scalar value to broadcast. This is derived from the canonical 1832 // induction variable. If a truncation type is given, truncate the canonical 1833 // induction variable and step. Otherwise, derive these values from the 1834 // induction descriptor. 1835 auto CreateScalarIV = [&](Value *&Step) -> Value * { 1836 Value *ScalarIV = Induction; 1837 if (IV != OldInduction) { 1838 ScalarIV = IV->getType()->isIntegerTy() 1839 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1840 : Builder.CreateCast(Instruction::SIToFP, Induction, 1841 IV->getType()); 1842 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1843 ScalarIV->setName("offset.idx"); 1844 } 1845 if (Trunc) { 1846 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1847 assert(Step->getType()->isIntegerTy() && 1848 "Truncation requires an integer step"); 1849 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1850 Step = Builder.CreateTrunc(Step, TruncType); 1851 } 1852 return ScalarIV; 1853 }; 1854 1855 // Create the vector values from the scalar IV, in the absence of creating a 1856 // vector IV. 1857 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 1858 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1859 for (unsigned Part = 0; Part < UF; ++Part) { 1860 Value *EntryPart = 1861 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); 1862 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1863 if (Trunc) 1864 addMetadata(EntryPart, Trunc); 1865 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1866 } 1867 }; 1868 1869 // Now do the actual transformations, and start with creating the step value. 1870 Value *Step = CreateStepValue(ID.getStep()); 1871 if (VF <= 1) { 1872 Value *ScalarIV = CreateScalarIV(Step); 1873 CreateSplatIV(ScalarIV, Step); 1874 return; 1875 } 1876 1877 // Determine if we want a scalar version of the induction variable. 
This is 1878 // true if the induction variable itself is not widened, or if it has at 1879 // least one user in the loop that is not widened. 1880 auto NeedsScalarIV = needsScalarInduction(EntryVal); 1881 if (!NeedsScalarIV) { 1882 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1883 return; 1884 } 1885 1886 // Try to create a new independent vector induction variable. If we can't 1887 // create the phi node, we will splat the scalar induction variable in each 1888 // loop iteration. 1889 if (!shouldScalarizeInstruction(EntryVal)) { 1890 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1891 Value *ScalarIV = CreateScalarIV(Step); 1892 // Create scalar steps that can be used by instructions we will later 1893 // scalarize. Note that the addition of the scalar steps will not increase 1894 // the number of instructions in the loop in the common case prior to 1895 // InstCombine. We will be trading one vector extract for each scalar step. 1896 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1897 return; 1898 } 1899 1900 // If we haven't yet vectorized the induction variable, splat the scalar 1901 // induction variable, and build the necessary step vectors. 1902 // TODO: Don't do it unless the vectorized IV is really required. 1903 Value *ScalarIV = CreateScalarIV(Step); 1904 CreateSplatIV(ScalarIV, Step); 1905 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1906 } 1907 1908 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 1909 Instruction::BinaryOps BinOp) { 1910 // Create and check the types. 1911 assert(Val->getType()->isVectorTy() && "Must be a vector"); 1912 int VLen = Val->getType()->getVectorNumElements(); 1913 1914 Type *STy = Val->getType()->getScalarType(); 1915 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1916 "Induction Step must be an integer or FP"); 1917 assert(Step->getType() == STy && "Step has wrong type"); 1918 1919 SmallVector<Constant *, 8> Indices; 1920 1921 if (STy->isIntegerTy()) { 1922 // Create a vector of consecutive numbers from zero to VF. 1923 for (int i = 0; i < VLen; ++i) 1924 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 1925 1926 // Add the consecutive indices to the vector value. 1927 Constant *Cv = ConstantVector::get(Indices); 1928 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 1929 Step = Builder.CreateVectorSplat(VLen, Step); 1930 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1931 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 1932 // which can be found from the original scalar operations. 1933 Step = Builder.CreateMul(Cv, Step); 1934 return Builder.CreateAdd(Val, Step, "induction"); 1935 } 1936 1937 // Floating point induction. 1938 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1939 "Binary Opcode should be specified for FP induction"); 1940 // Create a vector of consecutive numbers from zero to VF. 1941 for (int i = 0; i < VLen; ++i) 1942 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 1943 1944 // Add the consecutive indices to the vector value. 1945 Constant *Cv = ConstantVector::get(Indices); 1946 1947 Step = Builder.CreateVectorSplat(VLen, Step); 1948 1949 // Floating point operations had to be 'fast' to enable the induction. 
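  // Illustrative result (hypothetical names, assuming VLen == 4, a float step
  // %s and StartIdx == 0):
  //   %cv        = <0.0, 1.0, 2.0, 3.0>
  //   %step.vec  = splat of %s
  //   %mul       = fmul fast <4 x float> %cv, %step.vec
  //   %induction = fadd fast <4 x float> %val, %mul   ; or fsub for FSub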
1950 FastMathFlags Flags; 1951 Flags.setFast(); 1952 1953 Value *MulOp = Builder.CreateFMul(Cv, Step); 1954 if (isa<Instruction>(MulOp)) 1955 // Have to check, MulOp may be a constant 1956 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 1957 1958 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1959 if (isa<Instruction>(BOp)) 1960 cast<Instruction>(BOp)->setFastMathFlags(Flags); 1961 return BOp; 1962 } 1963 1964 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 1965 Instruction *EntryVal, 1966 const InductionDescriptor &ID) { 1967 // We shouldn't have to build scalar steps if we aren't vectorizing. 1968 assert(VF > 1 && "VF should be greater than one"); 1969 1970 // Get the value type and ensure it and the step have the same integer type. 1971 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 1972 assert(ScalarIVTy == Step->getType() && 1973 "Val and Step should have the same type"); 1974 1975 // We build scalar steps for both integer and floating-point induction 1976 // variables. Here, we determine the kind of arithmetic we will perform. 1977 Instruction::BinaryOps AddOp; 1978 Instruction::BinaryOps MulOp; 1979 if (ScalarIVTy->isIntegerTy()) { 1980 AddOp = Instruction::Add; 1981 MulOp = Instruction::Mul; 1982 } else { 1983 AddOp = ID.getInductionOpcode(); 1984 MulOp = Instruction::FMul; 1985 } 1986 1987 // Determine the number of scalars we need to generate for each unroll 1988 // iteration. If EntryVal is uniform, we only need to generate the first 1989 // lane. Otherwise, we generate all VF values. 1990 unsigned Lanes = 1991 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 1992 : VF; 1993 // Compute the scalar steps and save the results in VectorLoopValueMap. 1994 for (unsigned Part = 0; Part < UF; ++Part) { 1995 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 1996 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); 1997 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 1998 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 1999 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2000 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2001 } 2002 } 2003 } 2004 2005 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2006 assert(V != Induction && "The new induction variable should not be used."); 2007 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2008 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2009 2010 // If we have a stride that is replaced by one, do it here. Defer this for 2011 // the VPlan-native path until we start running Legal checks in that path. 2012 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2013 V = ConstantInt::get(V->getType(), 1); 2014 2015 // If we have a vector mapped to this value, return it. 2016 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2017 return VectorLoopValueMap.getVectorValue(V, Part); 2018 2019 // If the value has not been vectorized, check if it has been scalarized 2020 // instead. If it has been scalarized, and we actually need the value in 2021 // vector form, we will construct the vector values on demand. 2022 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2023 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2024 2025 // If we've scalarized a value, that value should be an instruction. 
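    // Illustrative sketch (assuming VF == 4, hypothetical names): if V was
    // scalarized into %v0..%v3 for this Part, the code below either
    // broadcasts %v0 when V is uniform after vectorization, or packs
    // %v0..%v3 into a vector with a chain of insertelement instructions
    // emitted right after the definition of %v3.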
2026 auto *I = cast<Instruction>(V); 2027 2028 // If we aren't vectorizing, we can just copy the scalar map values over to 2029 // the vector map. 2030 if (VF == 1) { 2031 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2032 return ScalarValue; 2033 } 2034 2035 // Get the last scalar instruction we generated for V and Part. If the value 2036 // is known to be uniform after vectorization, this corresponds to lane zero 2037 // of the Part unroll iteration. Otherwise, the last instruction is the one 2038 // we created for the last vector lane of the Part unroll iteration. 2039 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; 2040 auto *LastInst = cast<Instruction>( 2041 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2042 2043 // Set the insert point after the last scalarized instruction. This ensures 2044 // the insertelement sequence will directly follow the scalar definitions. 2045 auto OldIP = Builder.saveIP(); 2046 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2047 Builder.SetInsertPoint(&*NewIP); 2048 2049 // However, if we are vectorizing, we need to construct the vector values. 2050 // If the value is known to be uniform after vectorization, we can just 2051 // broadcast the scalar value corresponding to lane zero for each unroll 2052 // iteration. Otherwise, we construct the vector values using insertelement 2053 // instructions. Since the resulting vectors are stored in 2054 // VectorLoopValueMap, we will only generate the insertelements once. 2055 Value *VectorValue = nullptr; 2056 if (Cost->isUniformAfterVectorization(I, VF)) { 2057 VectorValue = getBroadcastInstrs(ScalarValue); 2058 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2059 } else { 2060 // Initialize packing with insertelements to start from undef. 2061 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2062 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2063 for (unsigned Lane = 0; Lane < VF; ++Lane) 2064 packScalarIntoVectorValue(V, {Part, Lane}); 2065 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2066 } 2067 Builder.restoreIP(OldIP); 2068 return VectorValue; 2069 } 2070 2071 // If this scalar is unknown, assume that it is a constant or that it is 2072 // loop invariant. Broadcast V and save the value for future uses. 2073 Value *B = getBroadcastInstrs(V); 2074 VectorLoopValueMap.setVectorValue(V, Part, B); 2075 return B; 2076 } 2077 2078 Value * 2079 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2080 const VPIteration &Instance) { 2081 // If the value is not an instruction contained in the loop, it should 2082 // already be scalar. 2083 if (OrigLoop->isLoopInvariant(V)) 2084 return V; 2085 2086 assert(Instance.Lane > 0 2087 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2088 : true && "Uniform values only have lane zero"); 2089 2090 // If the value from the original loop has not been vectorized, it is 2091 // represented by UF x VF scalar values in the new loop. Return the requested 2092 // scalar value. 2093 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2094 return VectorLoopValueMap.getScalarValue(V, Instance); 2095 2096 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2097 // for the given unroll part. If this entry is not a vector type (i.e., the 2098 // vectorization factor is one), there is no need to generate an 2099 // extractelement instruction. 
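  // Illustrative sketch (assuming VF == 4): for Instance == {Part, Lane} the
  // result below is either the value itself when VF == 1, or
  //   extractelement <4 x Ty> %vec.part, i32 Lane
  // taken from the vector value computed for the requested unroll part.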
2100 auto *U = getOrCreateVectorValue(V, Instance.Part); 2101 if (!U->getType()->isVectorTy()) { 2102 assert(VF == 1 && "Value not scalarized has non-vector type"); 2103 return U; 2104 } 2105 2106 // Otherwise, the value from the original loop has been vectorized and is 2107 // represented by UF vector values. Extract and return the requested scalar 2108 // value from the appropriate vector lane. 2109 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2110 } 2111 2112 void InnerLoopVectorizer::packScalarIntoVectorValue( 2113 Value *V, const VPIteration &Instance) { 2114 assert(V != Induction && "The new induction variable should not be used."); 2115 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2116 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2117 2118 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2119 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2120 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2121 Builder.getInt32(Instance.Lane)); 2122 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2123 } 2124 2125 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2126 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2127 SmallVector<Constant *, 8> ShuffleMask; 2128 for (unsigned i = 0; i < VF; ++i) 2129 ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); 2130 2131 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2132 ConstantVector::get(ShuffleMask), 2133 "reverse"); 2134 } 2135 2136 // Return whether we allow using masked interleave-groups (for dealing with 2137 // strided loads/stores that reside in predicated blocks, or for dealing 2138 // with gaps). 2139 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2140 // If an override option has been passed in for interleaved accesses, use it. 2141 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2142 return EnableMaskedInterleavedMemAccesses; 2143 2144 return TTI.enableMaskedInterleavedAccessVectorization(); 2145 } 2146 2147 // Try to vectorize the interleave group that \p Instr belongs to. 2148 // 2149 // E.g. Translate following interleaved load group (factor = 3): 2150 // for (i = 0; i < N; i+=3) { 2151 // R = Pic[i]; // Member of index 0 2152 // G = Pic[i+1]; // Member of index 1 2153 // B = Pic[i+2]; // Member of index 2 2154 // ... // do something to R, G, B 2155 // } 2156 // To: 2157 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2158 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2159 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2160 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2161 // 2162 // Or translate following interleaved store group (factor = 3): 2163 // for (i = 0; i < N; i+=3) { 2164 // ... 
do something to R, G, B 2165 // Pic[i] = R; // Member of index 0 2166 // Pic[i+1] = G; // Member of index 1 2167 // Pic[i+2] = B; // Member of index 2 2168 // } 2169 // To: 2170 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2171 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2172 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2173 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2174 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2175 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, 2176 VPTransformState &State, 2177 VPValue *Addr, 2178 VPValue *BlockInMask) { 2179 const InterleaveGroup<Instruction> *Group = 2180 Cost->getInterleavedAccessGroup(Instr); 2181 assert(Group && "Fail to get an interleaved access group."); 2182 2183 // Skip if current instruction is not the insert position. 2184 if (Instr != Group->getInsertPos()) 2185 return; 2186 2187 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2188 2189 // Prepare for the vector type of the interleaved load/store. 2190 Type *ScalarTy = getMemInstValueType(Instr); 2191 unsigned InterleaveFactor = Group->getFactor(); 2192 Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); 2193 2194 // Prepare for the new pointers. 2195 SmallVector<Value *, 2> AddrParts; 2196 unsigned Index = Group->getIndex(Instr); 2197 2198 // TODO: extend the masked interleaved-group support to reversed access. 2199 assert((!BlockInMask || !Group->isReverse()) && 2200 "Reversed masked interleave-group not supported."); 2201 2202 // If the group is reverse, adjust the index to refer to the last vector lane 2203 // instead of the first. We adjust the index from the first vector lane, 2204 // rather than directly getting the pointer for lane VF - 1, because the 2205 // pointer operand of the interleaved access is supposed to be uniform. For 2206 // uniform instructions, we're only required to generate a value for the 2207 // first vector lane in each unroll iteration. 2208 if (Group->isReverse()) 2209 Index += (VF - 1) * Group->getFactor(); 2210 2211 for (unsigned Part = 0; Part < UF; Part++) { 2212 Value *AddrPart = State.get(Addr, {Part, 0}); 2213 setDebugLocFromInst(Builder, AddrPart); 2214 2215 // Notice current instruction could be any index. Need to adjust the address 2216 // to the member of index 0. 2217 // 2218 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2219 // b = A[i]; // Member of index 0 2220 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2221 // 2222 // E.g. A[i+1] = a; // Member of index 1 2223 // A[i] = b; // Member of index 0 2224 // A[i+2] = c; // Member of index 2 (Current instruction) 2225 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2226 2227 bool InBounds = false; 2228 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2229 InBounds = gep->isInBounds(); 2230 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2231 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2232 2233 // Cast to the vector pointer type. 
2234 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2235 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2236 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2237 } 2238 2239 setDebugLocFromInst(Builder, Instr); 2240 Value *UndefVec = UndefValue::get(VecTy); 2241 2242 Value *MaskForGaps = nullptr; 2243 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2244 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); 2245 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2246 } 2247 2248 // Vectorize the interleaved load group. 2249 if (isa<LoadInst>(Instr)) { 2250 // For each unroll part, create a wide load for the group. 2251 SmallVector<Value *, 2> NewLoads; 2252 for (unsigned Part = 0; Part < UF; Part++) { 2253 Instruction *NewLoad; 2254 if (BlockInMask || MaskForGaps) { 2255 assert(useMaskedInterleavedAccesses(*TTI) && 2256 "masked interleaved groups are not allowed."); 2257 Value *GroupMask = MaskForGaps; 2258 if (BlockInMask) { 2259 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2260 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2261 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2262 Value *ShuffledMask = Builder.CreateShuffleVector( 2263 BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); 2264 GroupMask = MaskForGaps 2265 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2266 MaskForGaps) 2267 : ShuffledMask; 2268 } 2269 NewLoad = 2270 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2271 GroupMask, UndefVec, "wide.masked.vec"); 2272 } 2273 else 2274 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2275 Group->getAlign(), "wide.vec"); 2276 Group->addMetadata(NewLoad); 2277 NewLoads.push_back(NewLoad); 2278 } 2279 2280 // For each member in the group, shuffle out the appropriate data from the 2281 // wide loads. 2282 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2283 Instruction *Member = Group->getMember(I); 2284 2285 // Skip the gaps in the group. 2286 if (!Member) 2287 continue; 2288 2289 Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF); 2290 for (unsigned Part = 0; Part < UF; Part++) { 2291 Value *StridedVec = Builder.CreateShuffleVector( 2292 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2293 2294 // If this member has different type, cast the result type. 2295 if (Member->getType() != ScalarTy) { 2296 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2297 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2298 } 2299 2300 if (Group->isReverse()) 2301 StridedVec = reverseVector(StridedVec); 2302 2303 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2304 } 2305 } 2306 return; 2307 } 2308 2309 // The sub vector type for current instruction. 2310 VectorType *SubVT = VectorType::get(ScalarTy, VF); 2311 2312 // Vectorize the interleaved store group. 2313 for (unsigned Part = 0; Part < UF; Part++) { 2314 // Collect the stored vector from each member. 
2315 SmallVector<Value *, 4> StoredVecs; 2316 for (unsigned i = 0; i < InterleaveFactor; i++) { 2317 // Interleaved store group doesn't allow a gap, so each index has a member 2318 Instruction *Member = Group->getMember(i); 2319 assert(Member && "Fail to get a member from an interleaved store group"); 2320 2321 Value *StoredVec = getOrCreateVectorValue( 2322 cast<StoreInst>(Member)->getValueOperand(), Part); 2323 if (Group->isReverse()) 2324 StoredVec = reverseVector(StoredVec); 2325 2326 // If this member has different type, cast it to a unified type. 2327 2328 if (StoredVec->getType() != SubVT) 2329 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2330 2331 StoredVecs.push_back(StoredVec); 2332 } 2333 2334 // Concatenate all vectors into a wide vector. 2335 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2336 2337 // Interleave the elements in the wide vector. 2338 Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor); 2339 Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, 2340 "interleaved.vec"); 2341 2342 Instruction *NewStoreInstr; 2343 if (BlockInMask) { 2344 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2345 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2346 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2347 Value *ShuffledMask = Builder.CreateShuffleVector( 2348 BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); 2349 NewStoreInstr = Builder.CreateMaskedStore( 2350 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2351 } 2352 else 2353 NewStoreInstr = 2354 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2355 2356 Group->addMetadata(NewStoreInstr); 2357 } 2358 } 2359 2360 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2361 VPTransformState &State, 2362 VPValue *Addr, 2363 VPValue *StoredValue, 2364 VPValue *BlockInMask) { 2365 // Attempt to issue a wide load. 2366 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2367 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2368 2369 assert((LI || SI) && "Invalid Load/Store instruction"); 2370 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2371 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2372 2373 LoopVectorizationCostModel::InstWidening Decision = 2374 Cost->getWideningDecision(Instr, VF); 2375 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 2376 "CM decision should be taken at this point"); 2377 if (Decision == LoopVectorizationCostModel::CM_Interleave) 2378 return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask); 2379 2380 Type *ScalarDataTy = getMemInstValueType(Instr); 2381 Type *DataTy = VectorType::get(ScalarDataTy, VF); 2382 // An alignment of 0 means target abi alignment. We need to use the scalar's 2383 // target abi alignment in such a case. 2384 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2385 const Align Alignment = 2386 DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy); 2387 2388 // Determine if the pointer operand of the access is either consecutive or 2389 // reverse consecutive. 2390 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2391 bool ConsecutiveStride = 2392 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2393 bool CreateGatherScatter = 2394 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2395 2396 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2397 // gather/scatter. 
Otherwise Decision should have been to Scalarize. 2398 assert((ConsecutiveStride || CreateGatherScatter) && 2399 "The instruction should be scalarized"); 2400 (void)ConsecutiveStride; 2401 2402 VectorParts BlockInMaskParts(UF); 2403 bool isMaskRequired = BlockInMask; 2404 if (isMaskRequired) 2405 for (unsigned Part = 0; Part < UF; ++Part) 2406 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2407 2408 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2409 // Calculate the pointer for the specific unroll-part. 2410 GetElementPtrInst *PartPtr = nullptr; 2411 2412 bool InBounds = false; 2413 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2414 InBounds = gep->isInBounds(); 2415 2416 if (Reverse) { 2417 // If the address is consecutive but reversed, then the 2418 // wide store needs to start at the last vector element. 2419 PartPtr = cast<GetElementPtrInst>( 2420 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); 2421 PartPtr->setIsInBounds(InBounds); 2422 PartPtr = cast<GetElementPtrInst>( 2423 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); 2424 PartPtr->setIsInBounds(InBounds); 2425 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2426 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2427 } else { 2428 PartPtr = cast<GetElementPtrInst>( 2429 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); 2430 PartPtr->setIsInBounds(InBounds); 2431 } 2432 2433 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2434 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2435 }; 2436 2437 // Handle Stores: 2438 if (SI) { 2439 setDebugLocFromInst(Builder, SI); 2440 2441 for (unsigned Part = 0; Part < UF; ++Part) { 2442 Instruction *NewSI = nullptr; 2443 Value *StoredVal = State.get(StoredValue, Part); 2444 if (CreateGatherScatter) { 2445 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2446 Value *VectorGep = State.get(Addr, Part); 2447 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2448 MaskPart); 2449 } else { 2450 if (Reverse) { 2451 // If we store to reverse consecutive memory locations, then we need 2452 // to reverse the order of elements in the stored value. 2453 StoredVal = reverseVector(StoredVal); 2454 // We don't want to update the value in the map as it might be used in 2455 // another expression. So don't call resetVectorValue(StoredVal). 2456 } 2457 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2458 if (isMaskRequired) 2459 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2460 BlockInMaskParts[Part]); 2461 else 2462 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2463 } 2464 addMetadata(NewSI, SI); 2465 } 2466 return; 2467 } 2468 2469 // Handle loads. 2470 assert(LI && "Must have a load instruction"); 2471 setDebugLocFromInst(Builder, LI); 2472 for (unsigned Part = 0; Part < UF; ++Part) { 2473 Value *NewLI; 2474 if (CreateGatherScatter) { 2475 Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; 2476 Value *VectorGep = State.get(Addr, Part); 2477 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2478 nullptr, "wide.masked.gather"); 2479 addMetadata(NewLI, LI); 2480 } else { 2481 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2482 if (isMaskRequired) 2483 NewLI = Builder.CreateMaskedLoad( 2484 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2485 "wide.masked.load"); 2486 else 2487 NewLI = 2488 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2489 2490 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2491 addMetadata(NewLI, LI); 2492 if (Reverse) 2493 NewLI = reverseVector(NewLI); 2494 } 2495 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2496 } 2497 } 2498 2499 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2500 const VPIteration &Instance, 2501 bool IfPredicateInstr) { 2502 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2503 2504 setDebugLocFromInst(Builder, Instr); 2505 2506 // Does this instruction return a value ? 2507 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2508 2509 Instruction *Cloned = Instr->clone(); 2510 if (!IsVoidRetTy) 2511 Cloned->setName(Instr->getName() + ".cloned"); 2512 2513 // Replace the operands of the cloned instructions with their scalar 2514 // equivalents in the new loop. 2515 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 2516 auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance); 2517 Cloned->setOperand(op, NewOp); 2518 } 2519 addNewMetadata(Cloned, Instr); 2520 2521 // Place the cloned scalar in the new loop. 2522 Builder.Insert(Cloned); 2523 2524 // Add the cloned scalar to the scalar map entry. 2525 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2526 2527 // If we just cloned a new assumption, add it the assumption cache. 2528 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2529 if (II->getIntrinsicID() == Intrinsic::assume) 2530 AC->registerAssumption(II); 2531 2532 // End if-block. 2533 if (IfPredicateInstr) 2534 PredicatedInstructions.push_back(Cloned); 2535 } 2536 2537 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2538 Value *End, Value *Step, 2539 Instruction *DL) { 2540 BasicBlock *Header = L->getHeader(); 2541 BasicBlock *Latch = L->getLoopLatch(); 2542 // As we're just creating this loop, it's possible no latch exists 2543 // yet. If so, use the header as this will be a single block loop. 2544 if (!Latch) 2545 Latch = Header; 2546 2547 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2548 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2549 setDebugLocFromInst(Builder, OldInst); 2550 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2551 2552 Builder.SetInsertPoint(Latch->getTerminator()); 2553 setDebugLocFromInst(Builder, OldInst); 2554 2555 // Create i+1 and fill the PHINode. 2556 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2557 Induction->addIncoming(Start, L->getLoopPreheader()); 2558 Induction->addIncoming(Next, Latch); 2559 // Create the compare. 2560 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2561 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2562 2563 // Now we have two terminators. Remove the old one from the block. 
2564 Latch->getTerminator()->eraseFromParent(); 2565 2566 return Induction; 2567 } 2568 2569 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2570 if (TripCount) 2571 return TripCount; 2572 2573 assert(L && "Create Trip Count for null loop."); 2574 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2575 // Find the loop boundaries. 2576 ScalarEvolution *SE = PSE.getSE(); 2577 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2578 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2579 "Invalid loop count"); 2580 2581 Type *IdxTy = Legal->getWidestInductionType(); 2582 assert(IdxTy && "No type for induction"); 2583 2584 // The exit count might have the type of i64 while the phi is i32. This can 2585 // happen if we have an induction variable that is sign extended before the 2586 // compare. The only way that we get a backedge taken count is that the 2587 // induction variable was signed and as such will not overflow. In such a case 2588 // truncation is legal. 2589 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2590 IdxTy->getPrimitiveSizeInBits()) 2591 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2592 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2593 2594 // Get the total trip count from the count by adding 1. 2595 const SCEV *ExitCount = SE->getAddExpr( 2596 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2597 2598 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2599 2600 // Expand the trip count and place the new instructions in the preheader. 2601 // Notice that the pre-header does not change, only the loop body. 2602 SCEVExpander Exp(*SE, DL, "induction"); 2603 2604 // Count holds the overall loop count (N). 2605 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2606 L->getLoopPreheader()->getTerminator()); 2607 2608 if (TripCount->getType()->isPointerTy()) 2609 TripCount = 2610 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2611 L->getLoopPreheader()->getTerminator()); 2612 2613 return TripCount; 2614 } 2615 2616 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2617 if (VectorTripCount) 2618 return VectorTripCount; 2619 2620 Value *TC = getOrCreateTripCount(L); 2621 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2622 2623 Type *Ty = TC->getType(); 2624 Constant *Step = ConstantInt::get(Ty, VF * UF); 2625 2626 // If the tail is to be folded by masking, round the number of iterations N 2627 // up to a multiple of Step instead of rounding down. This is done by first 2628 // adding Step-1 and then rounding down. Note that it's ok if this addition 2629 // overflows: the vector induction variable will eventually wrap to zero given 2630 // that it starts at zero and its Step is a power of two; the loop will then 2631 // exit, with the last early-exit vector comparison also producing all-true. 2632 if (Cost->foldTailByMasking()) { 2633 assert(isPowerOf2_32(VF * UF) && 2634 "VF*UF must be a power of 2 when folding tail by masking"); 2635 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); 2636 } 2637 2638 // Now we need to generate the expression for the part of the loop that the 2639 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2640 // iterations are not required for correctness, or N - Step, otherwise. 
Step 2641 // is equal to the vectorization factor (number of SIMD elements) times the 2642 // unroll factor (number of SIMD instructions). 2643 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2644 2645 // If there is a non-reversed interleaved group that may speculatively access 2646 // memory out-of-bounds, we need to ensure that there will be at least one 2647 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2648 // the trip count, we set the remainder to be equal to the step. If the step 2649 // does not evenly divide the trip count, no adjustment is necessary since 2650 // there will already be scalar iterations. Note that the minimum iterations 2651 // check ensures that N >= Step. 2652 if (VF > 1 && Cost->requiresScalarEpilogue()) { 2653 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2654 R = Builder.CreateSelect(IsZero, Step, R); 2655 } 2656 2657 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2658 2659 return VectorTripCount; 2660 } 2661 2662 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2663 const DataLayout &DL) { 2664 // Verify that V is a vector type with same number of elements as DstVTy. 2665 unsigned VF = DstVTy->getNumElements(); 2666 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2667 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2668 Type *SrcElemTy = SrcVecTy->getElementType(); 2669 Type *DstElemTy = DstVTy->getElementType(); 2670 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2671 "Vector elements must have same size"); 2672 2673 // Do a direct cast if element types are castable. 2674 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2675 return Builder.CreateBitOrPointerCast(V, DstVTy); 2676 } 2677 // V cannot be directly casted to desired vector type. 2678 // May happen when V is a floating point vector but DstVTy is a vector of 2679 // pointers or vice-versa. Handle this using a two-step bitcast using an 2680 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2681 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2682 "Only one type should be a pointer type"); 2683 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2684 "Only one type should be a floating point type"); 2685 Type *IntTy = 2686 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2687 VectorType *VecIntTy = VectorType::get(IntTy, VF); 2688 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2689 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2690 } 2691 2692 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2693 BasicBlock *Bypass) { 2694 Value *Count = getOrCreateTripCount(L); 2695 // Reuse existing vector loop preheader for TC checks. 2696 // Note that new preheader block is generated for vector loop. 2697 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2698 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2699 2700 // Generate code to check if the loop's trip count is less than VF * UF, or 2701 // equal to it in case a scalar epilogue is required; this implies that the 2702 // vector trip count is zero. This check also covers the case where adding one 2703 // to the backedge-taken count overflowed leading to an incorrect trip count 2704 // of zero. In this case we will also jump to the scalar loop. 2705 auto P = Cost->requiresScalarEpilogue() ? 
ICmpInst::ICMP_ULE 2706 : ICmpInst::ICMP_ULT; 2707 2708 // If tail is to be folded, vector loop takes care of all iterations. 2709 Value *CheckMinIters = Builder.getFalse(); 2710 if (!Cost->foldTailByMasking()) 2711 CheckMinIters = Builder.CreateICmp( 2712 P, Count, ConstantInt::get(Count->getType(), VF * UF), 2713 "min.iters.check"); 2714 2715 // Create new preheader for vector loop. 2716 LoopVectorPreHeader = 2717 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2718 "vector.ph"); 2719 2720 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2721 DT->getNode(Bypass)->getIDom()) && 2722 "TC check is expected to dominate Bypass"); 2723 2724 // Update dominator for Bypass & LoopExit. 2725 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2726 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2727 2728 ReplaceInstWithInst( 2729 TCCheckBlock->getTerminator(), 2730 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2731 LoopBypassBlocks.push_back(TCCheckBlock); 2732 } 2733 2734 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2735 // Reuse existing vector loop preheader for SCEV checks. 2736 // Note that new preheader block is generated for vector loop. 2737 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 2738 2739 // Generate the code to check that the SCEV assumptions that we made. 2740 // We want the new basic block to start at the first instruction in a 2741 // sequence of instructions that form a check. 2742 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2743 "scev.check"); 2744 Value *SCEVCheck = Exp.expandCodeForPredicate( 2745 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 2746 2747 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2748 if (C->isZero()) 2749 return; 2750 2751 assert(!SCEVCheckBlock->getParent()->hasOptSize() && 2752 "Cannot SCEV check stride or overflow when optimizing for size"); 2753 2754 SCEVCheckBlock->setName("vector.scevcheck"); 2755 // Create new preheader for vector loop. 2756 LoopVectorPreHeader = 2757 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 2758 nullptr, "vector.ph"); 2759 2760 // Update dominator only if this is first RT check. 2761 if (LoopBypassBlocks.empty()) { 2762 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2763 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2764 } 2765 2766 ReplaceInstWithInst( 2767 SCEVCheckBlock->getTerminator(), 2768 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 2769 LoopBypassBlocks.push_back(SCEVCheckBlock); 2770 AddedSafetyChecks = true; 2771 } 2772 2773 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2774 // VPlan-native path does not do any analysis for runtime checks currently. 2775 if (EnableVPlanNativePath) 2776 return; 2777 2778 // Reuse existing vector loop preheader for runtime memory checks. 2779 // Note that new preheader block is generated for vector loop. 2780 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 2781 2782 // Generate the code that checks in runtime if arrays overlap. We put the 2783 // checks into a separate block to make the more common case of few elements 2784 // faster. 
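// As an illustrative sketch (not the exact IR addRuntimeChecks() produces), the
// overlap test for two pointer groups A and B with expanded bounds
// [A.start, A.end) and [B.start, B.end) is conceptually:
//   %conflict = and (icmp ult A.start, B.end), (icmp ult B.start, A.end)
// If any pairwise test reports a conflict, the branch created below transfers
// control to the scalar loop (Bypass) rather than the vector preheader.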
2785 Instruction *FirstCheckInst; 2786 Instruction *MemRuntimeCheck; 2787 std::tie(FirstCheckInst, MemRuntimeCheck) = 2788 Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator()); 2789 if (!MemRuntimeCheck) 2790 return; 2791 2792 if (MemCheckBlock->getParent()->hasOptSize()) { 2793 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2794 "Cannot emit memory checks when optimizing for size, unless forced " 2795 "to vectorize."); 2796 ORE->emit([&]() { 2797 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2798 L->getStartLoc(), L->getHeader()) 2799 << "Code-size may be reduced by not forcing " 2800 "vectorization, or by source-code modifications " 2801 "eliminating the need for runtime checks " 2802 "(e.g., adding 'restrict')."; 2803 }); 2804 } 2805 2806 MemCheckBlock->setName("vector.memcheck"); 2807 // Create new preheader for vector loop. 2808 LoopVectorPreHeader = 2809 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2810 "vector.ph"); 2811 2812 // Update dominator only if this is first RT check. 2813 if (LoopBypassBlocks.empty()) { 2814 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2815 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2816 } 2817 2818 ReplaceInstWithInst( 2819 MemCheckBlock->getTerminator(), 2820 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); 2821 LoopBypassBlocks.push_back(MemCheckBlock); 2822 AddedSafetyChecks = true; 2823 2824 // We currently don't use LoopVersioning for the actual loop cloning but we 2825 // still use it to add the noalias metadata. 2826 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2827 PSE.getSE()); 2828 LVer->prepareNoAliasMetadata(); 2829 } 2830 2831 Value *InnerLoopVectorizer::emitTransformedIndex( 2832 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2833 const InductionDescriptor &ID) const { 2834 2835 SCEVExpander Exp(*SE, DL, "induction"); 2836 auto Step = ID.getStep(); 2837 auto StartValue = ID.getStartValue(); 2838 assert(Index->getType() == Step->getType() && 2839 "Index type does not match StepValue type"); 2840 2841 // Note: the IR at this point is broken. We cannot use SE to create any new 2842 // SCEV and then expand it, hoping that SCEV's simplification will give us 2843 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2844 // lead to various SCEV crashes. So all we can do is to use builder and rely 2845 // on InstCombine for future simplifications. Here we handle some trivial 2846 // cases only. 
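// Consequently the helpers below fold only trivial identities themselves, e.g.
// CreateAdd(%x, 0) simply returns %x and CreateMul(%x, 1) simply returns %x;
// anything else is emitted as a plain add/mul for InstCombine to simplify later.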
2847 auto CreateAdd = [&B](Value *X, Value *Y) { 2848 assert(X->getType() == Y->getType() && "Types don't match!"); 2849 if (auto *CX = dyn_cast<ConstantInt>(X)) 2850 if (CX->isZero()) 2851 return Y; 2852 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2853 if (CY->isZero()) 2854 return X; 2855 return B.CreateAdd(X, Y); 2856 }; 2857 2858 auto CreateMul = [&B](Value *X, Value *Y) { 2859 assert(X->getType() == Y->getType() && "Types don't match!"); 2860 if (auto *CX = dyn_cast<ConstantInt>(X)) 2861 if (CX->isOne()) 2862 return Y; 2863 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2864 if (CY->isOne()) 2865 return X; 2866 return B.CreateMul(X, Y); 2867 }; 2868 2869 switch (ID.getKind()) { 2870 case InductionDescriptor::IK_IntInduction: { 2871 assert(Index->getType() == StartValue->getType() && 2872 "Index type does not match StartValue type"); 2873 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 2874 return B.CreateSub(StartValue, Index); 2875 auto *Offset = CreateMul( 2876 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); 2877 return CreateAdd(StartValue, Offset); 2878 } 2879 case InductionDescriptor::IK_PtrInduction: { 2880 assert(isa<SCEVConstant>(Step) && 2881 "Expected constant step for pointer induction"); 2882 return B.CreateGEP( 2883 StartValue->getType()->getPointerElementType(), StartValue, 2884 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), 2885 &*B.GetInsertPoint()))); 2886 } 2887 case InductionDescriptor::IK_FpInduction: { 2888 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2889 auto InductionBinOp = ID.getInductionBinOp(); 2890 assert(InductionBinOp && 2891 (InductionBinOp->getOpcode() == Instruction::FAdd || 2892 InductionBinOp->getOpcode() == Instruction::FSub) && 2893 "Original bin op should be defined for FP induction"); 2894 2895 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 2896 2897 // Floating point operations had to be 'fast' to enable the induction. 2898 FastMathFlags Flags; 2899 Flags.setFast(); 2900 2901 Value *MulExp = B.CreateFMul(StepValue, Index); 2902 if (isa<Instruction>(MulExp)) 2903 // We have to check, the MulExp may be a constant. 2904 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 2905 2906 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2907 "induction"); 2908 if (isa<Instruction>(BOp)) 2909 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2910 2911 return BOp; 2912 } 2913 case InductionDescriptor::IK_NoInduction: 2914 return nullptr; 2915 } 2916 llvm_unreachable("invalid enum"); 2917 } 2918 2919 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 2920 /* 2921 In this function we generate a new loop. The new loop will contain 2922 the vectorized instructions while the old loop will continue to run the 2923 scalar remainder. 2924 2925 [ ] <-- loop iteration number check. 2926 / | 2927 / v 2928 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2929 | / | 2930 | / v 2931 || [ ] <-- vector pre header. 2932 |/ | 2933 | v 2934 | [ ] \ 2935 | [ ]_| <-- vector loop. 2936 | | 2937 | v 2938 | -[ ] <--- middle-block. 2939 | / | 2940 | / v 2941 -|- >[ ] <--- new preheader. 2942 | | 2943 | v 2944 | [ ] \ 2945 | [ ]_| <-- old scalar loop to handle remainder. 2946 \ | 2947 \ v 2948 >[ ] <-- exit block. 2949 ... 2950 */ 2951 2952 MDNode *OrigLoopID = OrigLoop->getLoopID(); 2953 2954 // Some loops have a single integer induction variable, while other loops 2955 // don't. 
One example is c++ iterators that often have multiple pointer 2956 // induction variables. In the code below we also support a case where we 2957 // don't have a single induction variable. 2958 // 2959 // We try to obtain an induction variable from the original loop as hard 2960 // as possible. However if we don't find one that: 2961 // - is an integer 2962 // - counts from zero, stepping by one 2963 // - is the size of the widest induction variable type 2964 // then we create a new one. 2965 OldInduction = Legal->getPrimaryInduction(); 2966 Type *IdxTy = Legal->getWidestInductionType(); 2967 2968 // Split the single block loop into the two loop structure described above. 2969 LoopScalarBody = OrigLoop->getHeader(); 2970 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 2971 LoopExitBlock = OrigLoop->getExitBlock(); 2972 assert(LoopExitBlock && "Must have an exit block"); 2973 assert(LoopVectorPreHeader && "Invalid loop structure"); 2974 2975 LoopMiddleBlock = 2976 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2977 LI, nullptr, "middle.block"); 2978 LoopScalarPreHeader = 2979 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 2980 nullptr, "scalar.ph"); 2981 // We intentionally don't let SplitBlock to update LoopInfo since 2982 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 2983 // LoopVectorBody is explicitly added to the correct place few lines later. 2984 LoopVectorBody = 2985 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2986 nullptr, nullptr, "vector.body"); 2987 2988 // Update dominator for loop exit. 2989 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 2990 2991 // Create and register the new vector loop. 2992 Loop *Lp = LI->AllocateLoop(); 2993 Loop *ParentLoop = OrigLoop->getParentLoop(); 2994 2995 // Insert the new loop into the loop nest and register the new basic blocks 2996 // before calling any utilities such as SCEV that require valid LoopInfo. 2997 if (ParentLoop) { 2998 ParentLoop->addChildLoop(Lp); 2999 } else { 3000 LI->addTopLevelLoop(Lp); 3001 } 3002 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3003 3004 // Find the loop boundaries. 3005 Value *Count = getOrCreateTripCount(Lp); 3006 3007 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3008 3009 // Now, compare the new count to zero. If it is zero skip the vector loop and 3010 // jump to the scalar loop. This check also covers the case where the 3011 // backedge-taken count is uint##_max: adding one to it will overflow leading 3012 // to an incorrect trip count of zero. In this (rare) case we will also jump 3013 // to the scalar loop. 3014 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3015 3016 // Generate the code to check any assumptions that we've made for SCEV 3017 // expressions. 3018 emitSCEVChecks(Lp, LoopScalarPreHeader); 3019 3020 // Generate the code that checks in runtime if arrays overlap. We put the 3021 // checks into a separate block to make the more common case of few elements 3022 // faster. 3023 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3024 3025 // Generate the induction variable. 3026 // The loop step is equal to the vectorization factor (num of SIMD elements) 3027 // times the unroll factor (num of SIMD instructions). 
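// For example, with VF = 4 and UF = 2 the induction variable created below
// steps by 8 on every vector iteration, and CountRoundDown is the trip count
// rounded down to a multiple of 8 (rounded up instead when the tail is folded
// by masking).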
3028 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3029 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 3030 Induction = 3031 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3032 getDebugLocFromInstOrOperands(OldInduction)); 3033 3034 // We are going to resume the execution of the scalar loop. 3035 // Go over all of the induction variables that we found and fix the 3036 // PHIs that are left in the scalar version of the loop. 3037 // The starting values of PHI nodes depend on the counter of the last 3038 // iteration in the vectorized loop. 3039 // If we come from a bypass edge then we need to start from the original 3040 // start value. 3041 3042 // This variable saves the new starting index for the scalar loop. It is used 3043 // to test if there are any tail iterations left once the vector loop has 3044 // completed. 3045 for (auto &InductionEntry : Legal->getInductionVars()) { 3046 PHINode *OrigPhi = InductionEntry.first; 3047 InductionDescriptor II = InductionEntry.second; 3048 3049 // Create phi nodes to merge from the backedge-taken check block. 3050 PHINode *BCResumeVal = 3051 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3052 LoopScalarPreHeader->getTerminator()); 3053 // Copy original phi DL over to the new one. 3054 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3055 Value *&EndValue = IVEndValues[OrigPhi]; 3056 if (OrigPhi == OldInduction) { 3057 // We know what the end value is. 3058 EndValue = CountRoundDown; 3059 } else { 3060 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); 3061 Type *StepType = II.getStep()->getType(); 3062 Instruction::CastOps CastOp = 3063 CastInst::getCastOpcode(CountRoundDown, true, StepType, true); 3064 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); 3065 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3066 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3067 EndValue->setName("ind.end"); 3068 } 3069 3070 // The new PHI merges the original incoming value, in case of a bypass, 3071 // or the value at the end of the vectorized loop. 3072 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3073 3074 // Fix the scalar body counter (PHI node). 3075 // The old induction's phi node in the scalar body needs the truncated 3076 // value. 3077 for (BasicBlock *BB : LoopBypassBlocks) 3078 BCResumeVal->addIncoming(II.getStartValue(), BB); 3079 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3080 } 3081 3082 // We need the OrigLoop (scalar loop part) latch terminator to help 3083 // produce correct debug info for the middle block BB instructions. 3084 // The legality check stage guarantees that the loop will have a single 3085 // latch. 3086 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3087 "Scalar loop latch terminator isn't a branch"); 3088 BranchInst *ScalarLatchBr = 3089 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3090 3091 // Add a check in the middle block to see if we have completed 3092 // all of the iterations in the first vector loop. 3093 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3094 // If tail is to be folded, we know we don't need to run the remainder. 
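// Illustrative middle-block IR for the non-folded case (names and types are
// shorthand):
//   %cmp.n = icmp eq i64 %trip.count, %n.vec
//   br i1 %cmp.n, label %exit.block, label %scalar.ph
// For example, with a trip count of 17 and VF * UF = 8, %n.vec is 16, the
// compare is false, and the scalar loop runs the one remaining iteration.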
3095 Value *CmpN = Builder.getTrue(); 3096 if (!Cost->foldTailByMasking()) { 3097 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3098 CountRoundDown, "cmp.n", 3099 LoopMiddleBlock->getTerminator()); 3100 3101 // Here we use the same DebugLoc as the scalar loop latch branch instead 3102 // of the corresponding compare because they may have ended up with 3103 // different line numbers and we want to avoid awkward line stepping while 3104 // debugging. Eg. if the compare has got a line number inside the loop. 3105 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3106 } 3107 3108 BranchInst *BrInst = 3109 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3110 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3111 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3112 3113 // Get ready to start creating new instructions into the vectorized body. 3114 assert(LoopVectorPreHeader == Lp->getLoopPreheader() && 3115 "Inconsistent vector loop preheader"); 3116 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3117 3118 Optional<MDNode *> VectorizedLoopID = 3119 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3120 LLVMLoopVectorizeFollowupVectorized}); 3121 if (VectorizedLoopID.hasValue()) { 3122 Lp->setLoopID(VectorizedLoopID.getValue()); 3123 3124 // Do not setAlreadyVectorized if loop attributes have been defined 3125 // explicitly. 3126 return LoopVectorPreHeader; 3127 } 3128 3129 // Keep all loop hints from the original loop on the vector loop (we'll 3130 // replace the vectorizer-specific hints below). 3131 if (MDNode *LID = OrigLoop->getLoopID()) 3132 Lp->setLoopID(LID); 3133 3134 LoopVectorizeHints Hints(Lp, true, *ORE); 3135 Hints.setAlreadyVectorized(); 3136 3137 #ifdef EXPENSIVE_CHECKS 3138 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3139 LI->verify(*DT); 3140 #endif 3141 3142 return LoopVectorPreHeader; 3143 } 3144 3145 // Fix up external users of the induction variable. At this point, we are 3146 // in LCSSA form, with all external PHIs that use the IV having one input value, 3147 // coming from the remainder loop. We need those PHIs to also have a correct 3148 // value for the IV when arriving directly from the middle block. 3149 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3150 const InductionDescriptor &II, 3151 Value *CountRoundDown, Value *EndValue, 3152 BasicBlock *MiddleBlock) { 3153 // There are two kinds of external IV usages - those that use the value 3154 // computed in the last iteration (the PHI) and those that use the penultimate 3155 // value (the value that feeds into the phi from the loop latch). 3156 // We allow both, but they, obviously, have different values. 3157 3158 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3159 3160 DenseMap<Value *, Value *> MissingVals; 3161 3162 // An external user of the last iteration's value should see the value that 3163 // the remainder loop uses to initialize its own IV. 3164 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3165 for (User *U : PostInc->users()) { 3166 Instruction *UI = cast<Instruction>(U); 3167 if (!OrigLoop->contains(UI)) { 3168 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3169 MissingVals[UI] = EndValue; 3170 } 3171 } 3172 3173 // An external user of the penultimate value need to see EndValue - Step. 3174 // The simplest way to get this is to recompute it from the constituent SCEVs, 3175 // that is Start + (Step * (CRD - 1)). 
3176 for (User *U : OrigPhi->users()) { 3177 auto *UI = cast<Instruction>(U); 3178 if (!OrigLoop->contains(UI)) { 3179 const DataLayout &DL = 3180 OrigLoop->getHeader()->getModule()->getDataLayout(); 3181 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3182 3183 IRBuilder<> B(MiddleBlock->getTerminator()); 3184 Value *CountMinusOne = B.CreateSub( 3185 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3186 Value *CMO = 3187 !II.getStep()->getType()->isIntegerTy() 3188 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3189 II.getStep()->getType()) 3190 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3191 CMO->setName("cast.cmo"); 3192 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3193 Escape->setName("ind.escape"); 3194 MissingVals[UI] = Escape; 3195 } 3196 } 3197 3198 for (auto &I : MissingVals) { 3199 PHINode *PHI = cast<PHINode>(I.first); 3200 // One corner case we have to handle is two IVs "chasing" each-other, 3201 // that is %IV2 = phi [...], [ %IV1, %latch ] 3202 // In this case, if IV1 has an external use, we need to avoid adding both 3203 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3204 // don't already have an incoming value for the middle block. 3205 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3206 PHI->addIncoming(I.second, MiddleBlock); 3207 } 3208 } 3209 3210 namespace { 3211 3212 struct CSEDenseMapInfo { 3213 static bool canHandle(const Instruction *I) { 3214 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3215 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3216 } 3217 3218 static inline Instruction *getEmptyKey() { 3219 return DenseMapInfo<Instruction *>::getEmptyKey(); 3220 } 3221 3222 static inline Instruction *getTombstoneKey() { 3223 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3224 } 3225 3226 static unsigned getHashValue(const Instruction *I) { 3227 assert(canHandle(I) && "Unknown instruction!"); 3228 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3229 I->value_op_end())); 3230 } 3231 3232 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3233 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3234 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3235 return LHS == RHS; 3236 return LHS->isIdenticalTo(RHS); 3237 } 3238 }; 3239 3240 } // end anonymous namespace 3241 3242 ///Perform cse of induction variable instructions. 3243 static void cse(BasicBlock *BB) { 3244 // Perform simple cse. 3245 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3246 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3247 Instruction *In = &*I++; 3248 3249 if (!CSEDenseMapInfo::canHandle(In)) 3250 continue; 3251 3252 // Check if we can replace this instruction with any of the 3253 // visited instructions. 3254 if (Instruction *V = CSEMap.lookup(In)) { 3255 In->replaceAllUsesWith(V); 3256 In->eraseFromParent(); 3257 continue; 3258 } 3259 3260 CSEMap[In] = In; 3261 } 3262 } 3263 3264 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3265 unsigned VF, 3266 bool &NeedToScalarize) { 3267 Function *F = CI->getCalledFunction(); 3268 Type *ScalarRetTy = CI->getType(); 3269 SmallVector<Type *, 4> Tys, ScalarTys; 3270 for (auto &ArgOp : CI->arg_operands()) 3271 ScalarTys.push_back(ArgOp->getType()); 3272 3273 // Estimate cost of scalarized vector call. 
The source operands are assumed 3274 // to be vectors, so we need to extract individual elements from there, 3275 // execute VF scalar calls, and then gather the result into the vector return 3276 // value. 3277 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); 3278 if (VF == 1) 3279 return ScalarCallCost; 3280 3281 // Compute corresponding vector type for return value and arguments. 3282 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3283 for (Type *ScalarTy : ScalarTys) 3284 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3285 3286 // Compute costs of unpacking argument values for the scalar calls and 3287 // packing the return values to a vector. 3288 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3289 3290 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3291 3292 // If we can't emit a vector call for this function, then the currently found 3293 // cost is the cost we need to return. 3294 NeedToScalarize = true; 3295 VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/); 3296 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3297 3298 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3299 return Cost; 3300 3301 // If the corresponding vector cost is cheaper, return its cost. 3302 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); 3303 if (VectorCallCost < Cost) { 3304 NeedToScalarize = false; 3305 return VectorCallCost; 3306 } 3307 return Cost; 3308 } 3309 3310 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3311 unsigned VF) { 3312 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3313 assert(ID && "Expected intrinsic call!"); 3314 3315 FastMathFlags FMF; 3316 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3317 FMF = FPMO->getFastMathFlags(); 3318 3319 SmallVector<Value *, 4> Operands(CI->arg_operands()); 3320 return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF, CI); 3321 } 3322 3323 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3324 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3325 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3326 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3327 } 3328 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3329 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3330 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3331 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3332 } 3333 3334 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3335 // For every instruction `I` in MinBWs, truncate the operands, create a 3336 // truncated version of `I` and reextend its result. InstCombine runs 3337 // later and will remove any ext/trunc pairs. 3338 SmallPtrSet<Value *, 4> Erased; 3339 for (const auto &KV : Cost->getMinimalBitwidths()) { 3340 // If the value wasn't vectorized, we must maintain the original scalar 3341 // type. The absence of the value from VectorLoopValueMap indicates that it 3342 // wasn't vectorized. 
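// As an example of the shrinking performed below, a 32-bit add whose minimal
// bit width is 8 becomes, in shorthand IR for VF = 4:
//   %a.trunc = add <4 x i8> (trunc %x), (trunc %y)
//   %a       = zext <4 x i8> %a.trunc to <4 x i32>
// and InstCombine is expected to remove any ext/trunc pairs made redundant.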
3343 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3344 continue; 3345 for (unsigned Part = 0; Part < UF; ++Part) { 3346 Value *I = getOrCreateVectorValue(KV.first, Part); 3347 if (Erased.find(I) != Erased.end() || I->use_empty() || 3348 !isa<Instruction>(I)) 3349 continue; 3350 Type *OriginalTy = I->getType(); 3351 Type *ScalarTruncatedTy = 3352 IntegerType::get(OriginalTy->getContext(), KV.second); 3353 Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, 3354 OriginalTy->getVectorNumElements()); 3355 if (TruncatedTy == OriginalTy) 3356 continue; 3357 3358 IRBuilder<> B(cast<Instruction>(I)); 3359 auto ShrinkOperand = [&](Value *V) -> Value * { 3360 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3361 if (ZI->getSrcTy() == TruncatedTy) 3362 return ZI->getOperand(0); 3363 return B.CreateZExtOrTrunc(V, TruncatedTy); 3364 }; 3365 3366 // The actual instruction modification depends on the instruction type, 3367 // unfortunately. 3368 Value *NewI = nullptr; 3369 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3370 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3371 ShrinkOperand(BO->getOperand(1))); 3372 3373 // Any wrapping introduced by shrinking this operation shouldn't be 3374 // considered undefined behavior. So, we can't unconditionally copy 3375 // arithmetic wrapping flags to NewI. 3376 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3377 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3378 NewI = 3379 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3380 ShrinkOperand(CI->getOperand(1))); 3381 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3382 NewI = B.CreateSelect(SI->getCondition(), 3383 ShrinkOperand(SI->getTrueValue()), 3384 ShrinkOperand(SI->getFalseValue())); 3385 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3386 switch (CI->getOpcode()) { 3387 default: 3388 llvm_unreachable("Unhandled cast!"); 3389 case Instruction::Trunc: 3390 NewI = ShrinkOperand(CI->getOperand(0)); 3391 break; 3392 case Instruction::SExt: 3393 NewI = B.CreateSExtOrTrunc( 3394 CI->getOperand(0), 3395 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3396 break; 3397 case Instruction::ZExt: 3398 NewI = B.CreateZExtOrTrunc( 3399 CI->getOperand(0), 3400 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3401 break; 3402 } 3403 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3404 auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); 3405 auto *O0 = B.CreateZExtOrTrunc( 3406 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3407 auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); 3408 auto *O1 = B.CreateZExtOrTrunc( 3409 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3410 3411 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3412 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3413 // Don't do anything with the operands, just extend the result. 
3414 continue; 3415 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3416 auto Elements = IE->getOperand(0)->getType()->getVectorNumElements(); 3417 auto *O0 = B.CreateZExtOrTrunc( 3418 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3419 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3420 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3421 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3422 auto Elements = EE->getOperand(0)->getType()->getVectorNumElements(); 3423 auto *O0 = B.CreateZExtOrTrunc( 3424 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3425 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3426 } else { 3427 // If we don't know what to do, be conservative and don't do anything. 3428 continue; 3429 } 3430 3431 // Lastly, extend the result. 3432 NewI->takeName(cast<Instruction>(I)); 3433 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3434 I->replaceAllUsesWith(Res); 3435 cast<Instruction>(I)->eraseFromParent(); 3436 Erased.insert(I); 3437 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3438 } 3439 } 3440 3441 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3442 for (const auto &KV : Cost->getMinimalBitwidths()) { 3443 // If the value wasn't vectorized, we must maintain the original scalar 3444 // type. The absence of the value from VectorLoopValueMap indicates that it 3445 // wasn't vectorized. 3446 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3447 continue; 3448 for (unsigned Part = 0; Part < UF; ++Part) { 3449 Value *I = getOrCreateVectorValue(KV.first, Part); 3450 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3451 if (Inst && Inst->use_empty()) { 3452 Value *NewI = Inst->getOperand(0); 3453 Inst->eraseFromParent(); 3454 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3455 } 3456 } 3457 } 3458 } 3459 3460 void InnerLoopVectorizer::fixVectorizedLoop() { 3461 // Insert truncates and extends for any truncated instructions as hints to 3462 // InstCombine. 3463 if (VF > 1) 3464 truncateToMinimalBitwidths(); 3465 3466 // Fix widened non-induction PHIs by setting up the PHI operands. 3467 if (OrigPHIsToFix.size()) { 3468 assert(EnableVPlanNativePath && 3469 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3470 fixNonInductionPHIs(); 3471 } 3472 3473 // At this point every instruction in the original loop is widened to a 3474 // vector form. Now we need to fix the recurrences in the loop. These PHI 3475 // nodes are currently empty because we did not want to introduce cycles. 3476 // This is the second stage of vectorizing recurrences. 3477 fixCrossIterationPHIs(); 3478 3479 // Forget the original basic block. 3480 PSE.getSE()->forgetLoop(OrigLoop); 3481 3482 // Fix-up external users of the induction variables. 3483 for (auto &Entry : Legal->getInductionVars()) 3484 fixupIVUsers(Entry.first, Entry.second, 3485 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3486 IVEndValues[Entry.first], LoopMiddleBlock); 3487 3488 fixLCSSAPHIs(); 3489 for (Instruction *PI : PredicatedInstructions) 3490 sinkScalarOperands(&*PI); 3491 3492 // Remove redundant induction instructions. 3493 cse(LoopVectorBody); 3494 3495 // Set/update profile weights for the vector and remainder loops as original 3496 // loop iterations are now distributed among them. Note that original loop 3497 // represented by LoopScalarBody becomes remainder loop after vectorization. 
3498 //
3499 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3500 // end up with a slightly less precise result but that should be OK since
3501 // profile is not inherently precise anyway. Note also possible bypass of
3502 // vector code caused by legality checks is ignored, assigning all the weight
3503 // to the vector loop, optimistically.
3504 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3505 LI->getLoopFor(LoopVectorBody),
3506 LI->getLoopFor(LoopScalarBody), VF * UF);
3507 }
3508
3509 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3510 // In order to support recurrences we need to be able to vectorize Phi nodes.
3511 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3512 // stage #2: We now need to fix the recurrences by adding incoming edges to
3513 // the currently empty PHI nodes. At this point every instruction in the
3514 // original loop is widened to a vector form so we can use them to construct
3515 // the incoming edges.
3516 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3517 // Handle first-order recurrences and reductions that need to be fixed.
3518 if (Legal->isFirstOrderRecurrence(&Phi))
3519 fixFirstOrderRecurrence(&Phi);
3520 else if (Legal->isReductionVariable(&Phi))
3521 fixReduction(&Phi);
3522 }
3523 }
3524
3525 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3526 // This is the second phase of vectorizing first-order recurrences. An
3527 // overview of the transformation is described below. Suppose we have the
3528 // following loop.
3529 //
3530 // for (int i = 0; i < n; ++i)
3531 // b[i] = a[i] - a[i - 1];
3532 //
3533 // There is a first-order recurrence on "a". For this loop, the shorthand
3534 // scalar IR looks like:
3535 //
3536 // scalar.ph:
3537 // s_init = a[-1]
3538 // br scalar.body
3539 //
3540 // scalar.body:
3541 // i = phi [0, scalar.ph], [i+1, scalar.body]
3542 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3543 // s2 = a[i]
3544 // b[i] = s2 - s1
3545 // br cond, scalar.body, ...
3546 //
3547 // In this example, s1 is a recurrence because its value depends on the
3548 // previous iteration. In the first phase of vectorization, we created a
3549 // temporary value for s1. We now complete the vectorization and produce the
3550 // shorthand vector IR shown below (for VF = 4, UF = 1).
3551 //
3552 // vector.ph:
3553 // v_init = vector(..., ..., ..., a[-1])
3554 // br vector.body
3555 //
3556 // vector.body
3557 // i = phi [0, vector.ph], [i+4, vector.body]
3558 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3559 // v2 = a[i, i+1, i+2, i+3];
3560 // v3 = vector(v1(3), v2(0, 1, 2))
3561 // b[i, i+1, i+2, i+3] = v2 - v3
3562 // br cond, vector.body, middle.block
3563 //
3564 // middle.block:
3565 // x = v2(3)
3566 // br scalar.ph
3567 //
3568 // scalar.ph:
3569 // s_init = phi [x, middle.block], [a[-1], otherwise]
3570 // br scalar.body
3571 //
3572 // After the vector loop completes execution, we extract the next value of
3573 // the recurrence (x) to use as the initial value in the scalar loop.
3574
3575 // Get the original loop preheader and single loop latch.
3576 auto *Preheader = OrigLoop->getLoopPreheader();
3577 auto *Latch = OrigLoop->getLoopLatch();
3578
3579 // Get the initial and previous values of the scalar recurrence.
3580 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3581 auto *Previous = Phi->getIncomingValueForBlock(Latch);
3582
3583 // Create a vector from the initial value.
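// For example, assuming an i32 recurrence and VF = 4, the code below emits
// (shorthand):
//   %vector.recur.init = insertelement <4 x i32> undef, i32 %s_init, i32 3
// Only the last lane is initialized; the remaining lanes are never read.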
3584 auto *VectorInit = ScalarInit; 3585 if (VF > 1) { 3586 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3587 VectorInit = Builder.CreateInsertElement( 3588 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3589 Builder.getInt32(VF - 1), "vector.recur.init"); 3590 } 3591 3592 // We constructed a temporary phi node in the first phase of vectorization. 3593 // This phi node will eventually be deleted. 3594 Builder.SetInsertPoint( 3595 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3596 3597 // Create a phi node for the new recurrence. The current value will either be 3598 // the initial value inserted into a vector or loop-varying vector value. 3599 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3600 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3601 3602 // Get the vectorized previous value of the last part UF - 1. It appears last 3603 // among all unrolled iterations, due to the order of their construction. 3604 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3605 3606 // Find and set the insertion point after the previous value if it is an 3607 // instruction. 3608 BasicBlock::iterator InsertPt; 3609 // Note that the previous value may have been constant-folded so it is not 3610 // guaranteed to be an instruction in the vector loop. 3611 // FIXME: Loop invariant values do not form recurrences. We should deal with 3612 // them earlier. 3613 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3614 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3615 else { 3616 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3617 if (isa<PHINode>(PreviousLastPart)) 3618 // If the previous value is a phi node, we should insert after all the phi 3619 // nodes in the block containing the PHI to avoid breaking basic block 3620 // verification. Note that the basic block may be different to 3621 // LoopVectorBody, in case we predicate the loop. 3622 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3623 else 3624 InsertPt = ++PreviousInst->getIterator(); 3625 } 3626 Builder.SetInsertPoint(&*InsertPt); 3627 3628 // We will construct a vector for the recurrence by combining the values for 3629 // the current and previous iterations. This is the required shuffle mask. 3630 SmallVector<Constant *, 8> ShuffleMask(VF); 3631 ShuffleMask[0] = Builder.getInt32(VF - 1); 3632 for (unsigned I = 1; I < VF; ++I) 3633 ShuffleMask[I] = Builder.getInt32(I + VF - 1); 3634 3635 // The vector from which to take the initial value for the current iteration 3636 // (actual or unrolled). Initially, this is the vector phi node. 3637 Value *Incoming = VecPhi; 3638 3639 // Shuffle the current and previous vector and update the vector parts. 3640 for (unsigned Part = 0; Part < UF; ++Part) { 3641 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3642 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3643 auto *Shuffle = 3644 VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, 3645 ConstantVector::get(ShuffleMask)) 3646 : Incoming; 3647 PhiPart->replaceAllUsesWith(Shuffle); 3648 cast<Instruction>(PhiPart)->eraseFromParent(); 3649 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3650 Incoming = PreviousPart; 3651 } 3652 3653 // Fix the latch value of the new recurrence in the vector loop. 3654 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3655 3656 // Extract the last vector element in the middle block. 
This will be the 3657 // initial value for the recurrence when jumping to the scalar loop. 3658 auto *ExtractForScalar = Incoming; 3659 if (VF > 1) { 3660 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3661 ExtractForScalar = Builder.CreateExtractElement( 3662 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); 3663 } 3664 // Extract the second last element in the middle block if the 3665 // Phi is used outside the loop. We need to extract the phi itself 3666 // and not the last element (the phi update in the current iteration). This 3667 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3668 // when the scalar loop is not run at all. 3669 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3670 if (VF > 1) 3671 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3672 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); 3673 // When loop is unrolled without vectorizing, initialize 3674 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 3675 // `Incoming`. This is analogous to the vectorized case above: extracting the 3676 // second last element when VF > 1. 3677 else if (UF > 1) 3678 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 3679 3680 // Fix the initial value of the original recurrence in the scalar loop. 3681 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3682 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3683 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3684 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3685 Start->addIncoming(Incoming, BB); 3686 } 3687 3688 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3689 Phi->setName("scalar.recur"); 3690 3691 // Finally, fix users of the recurrence outside the loop. The users will need 3692 // either the last value of the scalar recurrence or the last value of the 3693 // vector recurrence we extracted in the middle block. Since the loop is in 3694 // LCSSA form, we just need to find all the phi nodes for the original scalar 3695 // recurrence in the exit block, and then add an edge for the middle block. 3696 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3697 if (LCSSAPhi.getIncomingValue(0) == Phi) { 3698 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 3699 } 3700 } 3701 } 3702 3703 void InnerLoopVectorizer::fixReduction(PHINode *Phi) { 3704 Constant *Zero = Builder.getInt32(0); 3705 3706 // Get it's reduction variable descriptor. 3707 assert(Legal->isReductionVariable(Phi) && 3708 "Unable to find the reduction variable"); 3709 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 3710 3711 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3712 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3713 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3714 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = 3715 RdxDesc.getMinMaxRecurrenceKind(); 3716 setDebugLocFromInst(Builder, ReductionStartValue); 3717 3718 // We need to generate a reduction vector from the incoming scalar. 3719 // To do so, we need to generate the 'identity' vector and override 3720 // one of the elements with the incoming scalar reduction. We need 3721 // to do it in the vector-loop preheader. 3722 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3723 3724 // This is the vector-clone of the value that leaves the loop. 
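// For example, for an integer add reduction with start value %init and VF = 4,
// the start vector built below is (illustrative names):
//   %rdx.start = insertelement <4 x i32> zeroinitializer, i32 %init, i32 0
// i.e. the add identity (0) in every lane except lane 0, which carries %init.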
3725 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3726
3727 // Find the reduction identity variable. Zero for addition, Or and Xor;
3728 // one for multiplication; -1 for And.
3729 Value *Identity;
3730 Value *VectorStart;
3731 if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3732 RK == RecurrenceDescriptor::RK_FloatMinMax) {
3733 // MinMax reductions have the start value as their identity.
3734 if (VF == 1) {
3735 VectorStart = Identity = ReductionStartValue;
3736 } else {
3737 VectorStart = Identity =
3738 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3739 }
3740 } else {
3741 // Handle other reduction kinds:
3742 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3743 RK, VecTy->getScalarType());
3744 if (VF == 1) {
3745 Identity = Iden;
3746 // This vector is the Identity vector where the first element is the
3747 // incoming scalar reduction.
3748 VectorStart = ReductionStartValue;
3749 } else {
3750 Identity = ConstantVector::getSplat({VF, false}, Iden);
3751
3752 // This vector is the Identity vector where the first element is the
3753 // incoming scalar reduction.
3754 VectorStart =
3755 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3756 }
3757 }
3758
3759 // Wrap flags are in general invalid after vectorization, clear them.
3760 clearReductionWrapFlags(RdxDesc);
3761
3762 // Fix the vector-loop phi.
3763
3764 // Reductions do not have to start at zero. They can start with
3765 // any loop-invariant value.
3766 BasicBlock *Latch = OrigLoop->getLoopLatch();
3767 Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3768
3769 for (unsigned Part = 0; Part < UF; ++Part) {
3770 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3771 Value *Val = getOrCreateVectorValue(LoopVal, Part);
3772 // Make sure to add the reduction start value only to the
3773 // first unroll part.
3774 Value *StartVal = (Part == 0) ? VectorStart : Identity;
3775 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3776 cast<PHINode>(VecRdxPhi)
3777 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3778 }
3779
3780 // Before each round, move the insertion point right between
3781 // the PHIs and the values we are going to write.
3782 // This allows us to write both PHINodes and the extractelement
3783 // instructions.
3784 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3785
3786 setDebugLocFromInst(Builder, LoopExitInst);
3787
3788 // If tail is folded by masking, the vector value to leave the loop should be
3789 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3790 // instead of the former.
3791 if (Cost->foldTailByMasking()) {
3792 for (unsigned Part = 0; Part < UF; ++Part) {
3793 Value *VecLoopExitInst =
3794 VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3795 Value *Sel = nullptr;
3796 for (User *U : VecLoopExitInst->users()) {
3797 if (isa<SelectInst>(U)) {
3798 assert(!Sel && "Reduction exit feeding two selects");
3799 Sel = U;
3800 } else
3801 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3802 }
3803 assert(Sel && "Reduction exit feeds no select");
3804 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3805 }
3806 }
3807
3808 // If the vector reduction can be performed in a smaller type, we truncate
3809 // then extend the loop exit value to enable InstCombine to evaluate the
3810 // entire expression in the smaller type.
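// For example, an i32 add reduction known to fit in i8 is rewritten in the
// vector loop latch roughly as (VF = 4; zext becomes sext for signed cases):
//   %rdx.trunc = trunc <4 x i32> %rdx to <4 x i8>
//   %rdx.ext   = zext <4 x i8> %rdx.trunc to <4 x i32>  ; old users now use this
// and the value actually reduced in the middle block is the narrow <4 x i8>
// form.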
3811 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { 3812 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3813 Builder.SetInsertPoint( 3814 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 3815 VectorParts RdxParts(UF); 3816 for (unsigned Part = 0; Part < UF; ++Part) { 3817 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3818 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3819 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3820 : Builder.CreateZExt(Trunc, VecTy); 3821 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 3822 UI != RdxParts[Part]->user_end();) 3823 if (*UI != Trunc) { 3824 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 3825 RdxParts[Part] = Extnd; 3826 } else { 3827 ++UI; 3828 } 3829 } 3830 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3831 for (unsigned Part = 0; Part < UF; ++Part) { 3832 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3833 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 3834 } 3835 } 3836 3837 // Reduce all of the unrolled parts into a single vector. 3838 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 3839 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3840 3841 // The middle block terminator has already been assigned a DebugLoc here (the 3842 // OrigLoop's single latch terminator). We want the whole middle block to 3843 // appear to execute on this line because: (a) it is all compiler generated, 3844 // (b) these instructions are always executed after evaluating the latch 3845 // conditional branch, and (c) other passes may add new predecessors which 3846 // terminate on this line. This is the easiest way to ensure we don't 3847 // accidentally cause an extra step back into the loop while debugging. 3848 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 3849 for (unsigned Part = 1; Part < UF; ++Part) { 3850 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3851 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3852 // Floating point operations had to be 'fast' to enable the reduction. 3853 ReducedPartRdx = addFastMathFlag( 3854 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 3855 ReducedPartRdx, "bin.rdx"), 3856 RdxDesc.getFastMathFlags()); 3857 else 3858 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 3859 RdxPart); 3860 } 3861 3862 if (VF > 1) { 3863 bool NoNaN = Legal->hasFunNoNaNAttr(); 3864 ReducedPartRdx = 3865 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 3866 // If the reduction can be performed in a smaller type, we need to extend 3867 // the reduction to the wider type before we branch to the original loop. 3868 if (Phi->getType() != RdxDesc.getRecurrenceType()) 3869 ReducedPartRdx = 3870 RdxDesc.isSigned() 3871 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 3872 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 3873 } 3874 3875 // Create a phi node that merges control-flow from the backedge-taken check 3876 // block and the middle block. 
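// Illustrative result in the scalar preheader with a single bypass block
// (names other than bc.merge.rdx are shorthand):
//   %bc.merge.rdx = phi i32 [ %init, %min.iters.check ],
//                           [ %rdx.final, %middle.block ]
// where %init is the original start value and %rdx.final is the reduced vector
// result computed above.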
3877 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 3878 LoopScalarPreHeader->getTerminator()); 3879 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3880 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3881 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3882 3883 // Now, we need to fix the users of the reduction variable 3884 // inside and outside of the scalar remainder loop. 3885 // We know that the loop is in LCSSA form. We need to update the 3886 // PHI nodes in the exit blocks. 3887 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3888 // All PHINodes need to have a single entry edge, or two if 3889 // we already fixed them. 3890 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 3891 3892 // We found a reduction value exit-PHI. Update it with the 3893 // incoming bypass edge. 3894 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 3895 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 3896 } // end of the LCSSA phi scan. 3897 3898 // Fix the scalar loop reduction variable with the incoming reduction sum 3899 // from the vector body and from the backedge value. 3900 int IncomingEdgeBlockIdx = 3901 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 3902 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 3903 // Pick the other block. 3904 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 3905 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3906 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3907 } 3908 3909 void InnerLoopVectorizer::clearReductionWrapFlags( 3910 RecurrenceDescriptor &RdxDesc) { 3911 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3912 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 3913 RK != RecurrenceDescriptor::RK_IntegerMult) 3914 return; 3915 3916 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 3917 assert(LoopExitInstr && "null loop exit instruction"); 3918 SmallVector<Instruction *, 8> Worklist; 3919 SmallPtrSet<Instruction *, 8> Visited; 3920 Worklist.push_back(LoopExitInstr); 3921 Visited.insert(LoopExitInstr); 3922 3923 while (!Worklist.empty()) { 3924 Instruction *Cur = Worklist.pop_back_val(); 3925 if (isa<OverflowingBinaryOperator>(Cur)) 3926 for (unsigned Part = 0; Part < UF; ++Part) { 3927 Value *V = getOrCreateVectorValue(Cur, Part); 3928 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 3929 } 3930 3931 for (User *U : Cur->users()) { 3932 Instruction *UI = cast<Instruction>(U); 3933 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 3934 Visited.insert(UI).second) 3935 Worklist.push_back(UI); 3936 } 3937 } 3938 } 3939 3940 void InnerLoopVectorizer::fixLCSSAPHIs() { 3941 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3942 if (LCSSAPhi.getNumIncomingValues() == 1) { 3943 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 3944 // Non-instruction incoming values will have only one value. 3945 unsigned LastLane = 0; 3946 if (isa<Instruction>(IncomingValue)) 3947 LastLane = Cost->isUniformAfterVectorization( 3948 cast<Instruction>(IncomingValue), VF) 3949 ? 0 3950 : VF - 1; 3951 // Can be a loop invariant incoming value or the last scalar value to be 3952 // extracted from the vectorized loop. 
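// For example, with UF = 2 and VF = 4 a non-uniform incoming instruction is
// taken from unroll part 1, lane 3 (its value in the last vector iteration)
// and added to the exit phi as the incoming value from the middle block.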
3953 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3954 Value *lastIncomingValue = 3955 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 3956 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 3957 } 3958 } 3959 } 3960 3961 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 3962 // The basic block and loop containing the predicated instruction. 3963 auto *PredBB = PredInst->getParent(); 3964 auto *VectorLoop = LI->getLoopFor(PredBB); 3965 3966 // Initialize a worklist with the operands of the predicated instruction. 3967 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 3968 3969 // Holds instructions that we need to analyze again. An instruction may be 3970 // reanalyzed if we don't yet know if we can sink it or not. 3971 SmallVector<Instruction *, 8> InstsToReanalyze; 3972 3973 // Returns true if a given use occurs in the predicated block. Phi nodes use 3974 // their operands in their corresponding predecessor blocks. 3975 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 3976 auto *I = cast<Instruction>(U.getUser()); 3977 BasicBlock *BB = I->getParent(); 3978 if (auto *Phi = dyn_cast<PHINode>(I)) 3979 BB = Phi->getIncomingBlock( 3980 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 3981 return BB == PredBB; 3982 }; 3983 3984 // Iteratively sink the scalarized operands of the predicated instruction 3985 // into the block we created for it. When an instruction is sunk, it's 3986 // operands are then added to the worklist. The algorithm ends after one pass 3987 // through the worklist doesn't sink a single instruction. 3988 bool Changed; 3989 do { 3990 // Add the instructions that need to be reanalyzed to the worklist, and 3991 // reset the changed indicator. 3992 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 3993 InstsToReanalyze.clear(); 3994 Changed = false; 3995 3996 while (!Worklist.empty()) { 3997 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 3998 3999 // We can't sink an instruction if it is a phi node, is already in the 4000 // predicated block, is not in the loop, or may have side effects. 4001 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4002 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4003 continue; 4004 4005 // It's legal to sink the instruction if all its uses occur in the 4006 // predicated block. Otherwise, there's nothing to do yet, and we may 4007 // need to reanalyze the instruction. 4008 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4009 InstsToReanalyze.push_back(I); 4010 continue; 4011 } 4012 4013 // Move the instruction to the beginning of the predicated block, and add 4014 // it's operands to the worklist. 4015 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4016 Worklist.insert(I->op_begin(), I->op_end()); 4017 4018 // The sinking may have enabled other instructions to be sunk, so we will 4019 // need to iterate. 
4020 Changed = true; 4021 } 4022 } while (Changed); 4023 } 4024 4025 void InnerLoopVectorizer::fixNonInductionPHIs() { 4026 for (PHINode *OrigPhi : OrigPHIsToFix) { 4027 PHINode *NewPhi = 4028 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4029 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4030 4031 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4032 predecessors(OrigPhi->getParent())); 4033 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4034 predecessors(NewPhi->getParent())); 4035 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4036 "Scalar and Vector BB should have the same number of predecessors"); 4037 4038 // The insertion point in Builder may be invalidated by the time we get 4039 // here. Force the Builder insertion point to something valid so that we do 4040 // not run into issues during insertion point restore in 4041 // getOrCreateVectorValue calls below. 4042 Builder.SetInsertPoint(NewPhi); 4043 4044 // The predecessor order is preserved and we can rely on mapping between 4045 // scalar and vector block predecessors. 4046 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4047 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4048 4049 // When looking up the new scalar/vector values to fix up, use incoming 4050 // values from original phi. 4051 Value *ScIncV = 4052 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4053 4054 // Scalar incoming value may need a broadcast 4055 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4056 NewPhi->addIncoming(NewIncV, NewPredBB); 4057 } 4058 } 4059 } 4060 4061 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, 4062 unsigned VF, bool IsPtrLoopInvariant, 4063 SmallBitVector &IsIndexLoopInvariant) { 4064 // Construct a vector GEP by widening the operands of the scalar GEP as 4065 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4066 // results in a vector of pointers when at least one operand of the GEP 4067 // is vector-typed. Thus, to keep the representation compact, we only use 4068 // vector-typed operands for loop-varying values. 4069 4070 if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4071 // If we are vectorizing, but the GEP has only loop-invariant operands, 4072 // the GEP we build (by only using vector-typed operands for 4073 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4074 // produce a vector of pointers, we need to either arbitrarily pick an 4075 // operand to broadcast, or broadcast a clone of the original GEP. 4076 // Here, we broadcast a clone of the original. 4077 // 4078 // TODO: If at some point we decide to scalarize instructions having 4079 // loop-invariant operands, this special case will no longer be 4080 // required. We would add the scalarization decision to 4081 // collectLoopScalars() and teach getVectorValue() to broadcast 4082 // the lane-zero scalar value. 4083 auto *Clone = Builder.Insert(GEP->clone()); 4084 for (unsigned Part = 0; Part < UF; ++Part) { 4085 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4086 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4087 addMetadata(EntryPart, GEP); 4088 } 4089 } else { 4090 // If the GEP has at least one loop-varying operand, we are sure to 4091 // produce a vector of pointers. But if we are only unrolling, we want 4092 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4093 // produce with the code below will be scalar (if VF == 1) or vector 4094 // (otherwise). 
Note that for the unroll-only case, we still maintain 4095 // values in the vector mapping with initVector, as we do for other 4096 // instructions. 4097 for (unsigned Part = 0; Part < UF; ++Part) { 4098 // The pointer operand of the new GEP. If it's loop-invariant, we 4099 // won't broadcast it. 4100 auto *Ptr = IsPtrLoopInvariant 4101 ? GEP->getPointerOperand() 4102 : getOrCreateVectorValue(GEP->getPointerOperand(), Part); 4103 4104 // Collect all the indices for the new GEP. If any index is 4105 // loop-invariant, we won't broadcast it. 4106 SmallVector<Value *, 4> Indices; 4107 for (auto Index : enumerate(GEP->indices())) { 4108 Value *User = Index.value().get(); 4109 if (IsIndexLoopInvariant[Index.index()]) 4110 Indices.push_back(User); 4111 else 4112 Indices.push_back(getOrCreateVectorValue(User, Part)); 4113 } 4114 4115 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4116 // but it should be a vector, otherwise. 4117 auto *NewGEP = 4118 GEP->isInBounds() 4119 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4120 Indices) 4121 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4122 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4123 "NewGEP is not a pointer vector"); 4124 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4125 addMetadata(NewGEP, GEP); 4126 } 4127 } 4128 } 4129 4130 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4131 unsigned VF) { 4132 PHINode *P = cast<PHINode>(PN); 4133 if (EnableVPlanNativePath) { 4134 // Currently we enter here in the VPlan-native path for non-induction 4135 // PHIs where all control flow is uniform. We simply widen these PHIs. 4136 // Create a vector phi with no operands - the vector phi operands will be 4137 // set at the end of vector code generation. 4138 Type *VecTy = 4139 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4140 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4141 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4142 OrigPHIsToFix.push_back(P); 4143 4144 return; 4145 } 4146 4147 assert(PN->getParent() == OrigLoop->getHeader() && 4148 "Non-header phis should have been handled elsewhere"); 4149 4150 // In order to support recurrences we need to be able to vectorize Phi nodes. 4151 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4152 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4153 // this value when we vectorize all of the instructions that use the PHI. 4154 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4155 for (unsigned Part = 0; Part < UF; ++Part) { 4156 // This is phase one of vectorizing PHIs. 4157 Type *VecTy = 4158 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4159 Value *EntryPart = PHINode::Create( 4160 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4161 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4162 } 4163 return; 4164 } 4165 4166 setDebugLocFromInst(Builder, P); 4167 4168 // This PHINode must be an induction variable. 4169 // Make sure that we know about it. 4170 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4171 4172 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4173 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4174 4175 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4176 // which can be found from the original scalar operations. 
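  // Illustrative sketch (hypothetical values, not taken from this file): with
  // VF = 4, UF = 1 and a pointer IV that is not uniform-after-vectorization,
  // the IK_PtrInduction case below emits one scalar address per lane,
  // conceptually
  //   next.gep.<lane> = start + (Induction + lane) * Step,   lane = 0..3
  // rather than a single vector-of-pointers GEP, because scalar GEPs lead to
  // better code (see the comment inside the case).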
4177 switch (II.getKind()) { 4178 case InductionDescriptor::IK_NoInduction: 4179 llvm_unreachable("Unknown induction"); 4180 case InductionDescriptor::IK_IntInduction: 4181 case InductionDescriptor::IK_FpInduction: 4182 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4183 case InductionDescriptor::IK_PtrInduction: { 4184 // Handle the pointer induction variable case. 4185 assert(P->getType()->isPointerTy() && "Unexpected type."); 4186 // This is the normalized GEP that starts counting at zero. 4187 Value *PtrInd = Induction; 4188 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); 4189 // Determine the number of scalars we need to generate for each unroll 4190 // iteration. If the instruction is uniform, we only need to generate the 4191 // first lane. Otherwise, we generate all VF values. 4192 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; 4193 // These are the scalar results. Notice that we don't generate vector GEPs 4194 // because scalar GEPs result in better code. 4195 for (unsigned Part = 0; Part < UF; ++Part) { 4196 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4197 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); 4198 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4199 Value *SclrGep = 4200 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4201 SclrGep->setName("next.gep"); 4202 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4203 } 4204 } 4205 return; 4206 } 4207 } 4208 } 4209 4210 /// A helper function for checking whether an integer division-related 4211 /// instruction may divide by zero (in which case it must be predicated if 4212 /// executed conditionally in the scalar code). 4213 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4214 /// Non-zero divisors that are non compile-time constants will not be 4215 /// converted into multiplication, so we will still end up scalarizing 4216 /// the division, but can do so w/o predication. 4217 static bool mayDivideByZero(Instruction &I) { 4218 assert((I.getOpcode() == Instruction::UDiv || 4219 I.getOpcode() == Instruction::SDiv || 4220 I.getOpcode() == Instruction::URem || 4221 I.getOpcode() == Instruction::SRem) && 4222 "Unexpected instruction"); 4223 Value *Divisor = I.getOperand(1); 4224 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4225 return !CInt || CInt->isZero(); 4226 } 4227 4228 void InnerLoopVectorizer::widenInstruction(Instruction &I) { 4229 switch (I.getOpcode()) { 4230 case Instruction::Call: 4231 case Instruction::Br: 4232 case Instruction::PHI: 4233 case Instruction::GetElementPtr: 4234 llvm_unreachable("This instruction is handled by a different recipe."); 4235 case Instruction::UDiv: 4236 case Instruction::SDiv: 4237 case Instruction::SRem: 4238 case Instruction::URem: 4239 case Instruction::Add: 4240 case Instruction::FAdd: 4241 case Instruction::Sub: 4242 case Instruction::FSub: 4243 case Instruction::FNeg: 4244 case Instruction::Mul: 4245 case Instruction::FMul: 4246 case Instruction::FDiv: 4247 case Instruction::FRem: 4248 case Instruction::Shl: 4249 case Instruction::LShr: 4250 case Instruction::AShr: 4251 case Instruction::And: 4252 case Instruction::Or: 4253 case Instruction::Xor: { 4254 // Just widen unops and binops. 
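    // For example (value names are illustrative only): with VF = 4 and
    // UF = 2, a scalar
    //   %r = add nsw i32 %a, %b
    // becomes two wide operations, one per unroll part,
    //   %r.part0 = add nsw <4 x i32> %a.part0, %b.part0
    //   %r.part1 = add nsw <4 x i32> %a.part1, %b.part1
    // with operands taken from getOrCreateVectorValue and the IR flags (nsw
    // here) copied from the original instruction via copyIRFlags below.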
4255 setDebugLocFromInst(Builder, &I); 4256 4257 for (unsigned Part = 0; Part < UF; ++Part) { 4258 SmallVector<Value *, 2> Ops; 4259 for (Value *Op : I.operands()) 4260 Ops.push_back(getOrCreateVectorValue(Op, Part)); 4261 4262 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4263 4264 if (auto *VecOp = dyn_cast<Instruction>(V)) 4265 VecOp->copyIRFlags(&I); 4266 4267 // Use this vector value for all users of the original instruction. 4268 VectorLoopValueMap.setVectorValue(&I, Part, V); 4269 addMetadata(V, &I); 4270 } 4271 4272 break; 4273 } 4274 case Instruction::Select: { 4275 // Widen selects. 4276 // If the selector is loop invariant we can create a select 4277 // instruction with a scalar condition. Otherwise, use vector-select. 4278 auto *SE = PSE.getSE(); 4279 bool InvariantCond = 4280 SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); 4281 setDebugLocFromInst(Builder, &I); 4282 4283 // The condition can be loop invariant but still defined inside the 4284 // loop. This means that we can't just use the original 'cond' value. 4285 // We have to take the 'vectorized' value and pick the first lane. 4286 // Instcombine will make this a no-op. 4287 4288 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); 4289 4290 for (unsigned Part = 0; Part < UF; ++Part) { 4291 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); 4292 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); 4293 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); 4294 Value *Sel = 4295 Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1); 4296 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4297 addMetadata(Sel, &I); 4298 } 4299 4300 break; 4301 } 4302 4303 case Instruction::ICmp: 4304 case Instruction::FCmp: { 4305 // Widen compares. Generate vector compares. 4306 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4307 auto *Cmp = cast<CmpInst>(&I); 4308 setDebugLocFromInst(Builder, Cmp); 4309 for (unsigned Part = 0; Part < UF; ++Part) { 4310 Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); 4311 Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part); 4312 Value *C = nullptr; 4313 if (FCmp) { 4314 // Propagate fast math flags. 4315 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4316 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4317 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4318 } else { 4319 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4320 } 4321 VectorLoopValueMap.setVectorValue(&I, Part, C); 4322 addMetadata(C, &I); 4323 } 4324 4325 break; 4326 } 4327 4328 case Instruction::ZExt: 4329 case Instruction::SExt: 4330 case Instruction::FPToUI: 4331 case Instruction::FPToSI: 4332 case Instruction::FPExt: 4333 case Instruction::PtrToInt: 4334 case Instruction::IntToPtr: 4335 case Instruction::SIToFP: 4336 case Instruction::UIToFP: 4337 case Instruction::Trunc: 4338 case Instruction::FPTrunc: 4339 case Instruction::BitCast: { 4340 auto *CI = cast<CastInst>(&I); 4341 setDebugLocFromInst(Builder, CI); 4342 4343 /// Vectorize casts. 4344 Type *DestTy = 4345 (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); 4346 4347 for (unsigned Part = 0; Part < UF; ++Part) { 4348 Value *A = getOrCreateVectorValue(CI->getOperand(0), Part); 4349 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4350 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4351 addMetadata(Cast, &I); 4352 } 4353 break; 4354 } 4355 default: 4356 // This instruction is not vectorized by simple widening. 
4357 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4358 llvm_unreachable("Unhandled instruction!"); 4359 } // end of switch. 4360 } 4361 4362 void InnerLoopVectorizer::widenCallInstruction(CallInst &I) { 4363 // Ignore dbg intrinsics. 4364 // TODO: Debug intrinsics should be skipped/handled during VPlan construction 4365 // rather than dropping them here. 4366 if (isa<DbgInfoIntrinsic>(I)) 4367 return; 4368 setDebugLocFromInst(Builder, &I); 4369 4370 Module *M = I.getParent()->getParent()->getParent(); 4371 auto *CI = cast<CallInst>(&I); 4372 4373 SmallVector<Type *, 4> Tys; 4374 for (Value *ArgOperand : CI->arg_operands()) 4375 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4376 4377 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4378 4379 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4380 // version of the instruction. 4381 // Is it beneficial to perform intrinsic call compared to lib call? 4382 bool NeedToScalarize = false; 4383 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4384 bool UseVectorIntrinsic = 4385 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4386 assert((UseVectorIntrinsic || !NeedToScalarize) && 4387 "Instruction should be scalarized elsewhere."); 4388 4389 for (unsigned Part = 0; Part < UF; ++Part) { 4390 SmallVector<Value *, 4> Args; 4391 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 4392 Value *Arg = CI->getArgOperand(i); 4393 // Some intrinsics have a scalar argument - don't replace it with a 4394 // vector. 4395 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) 4396 Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part); 4397 Args.push_back(Arg); 4398 } 4399 4400 Function *VectorF; 4401 if (UseVectorIntrinsic) { 4402 // Use vector version of the intrinsic. 4403 Type *TysForDecl[] = {CI->getType()}; 4404 if (VF > 1) 4405 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4406 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4407 } else { 4408 // Use vector version of the function call. 4409 const VFShape Shape = 4410 VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/); 4411 #ifndef NDEBUG 4412 const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI); 4413 assert(std::find_if(Infos.begin(), Infos.end(), 4414 [&Shape](const VFInfo &Info) { 4415 return Info.Shape == Shape; 4416 }) != Infos.end() && 4417 "Vector function shape is missing from the database."); 4418 #endif 4419 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4420 } 4421 assert(VectorF && "Can't create vector function."); 4422 4423 SmallVector<OperandBundleDef, 1> OpBundles; 4424 CI->getOperandBundlesAsDefs(OpBundles); 4425 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4426 4427 if (isa<FPMathOperator>(V)) 4428 V->copyFastMathFlags(CI); 4429 4430 VectorLoopValueMap.setVectorValue(&I, Part, V); 4431 addMetadata(V, &I); 4432 } 4433 } 4434 4435 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4436 // We should not collect Scalars more than once per VF. Right now, this 4437 // function is called from collectUniformsAndScalars(), which already does 4438 // this check. Collecting Scalars for VF=1 does not make any sense. 
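  // As a rough illustration: the address of a load/store that is widened to a
  // single wide (consecutive) access is only needed as one scalar value, so
  // its feeding GEP can stay scalar; the address of a gather/scatter must be
  // a vector of pointers, so such GEPs are deliberately not seeded here (see
  // isScalarUse below).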
4439 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && 4440 "This function should not be visited twice for the same VF"); 4441 4442 SmallSetVector<Instruction *, 8> Worklist; 4443 4444 // These sets are used to seed the analysis with pointers used by memory 4445 // accesses that will remain scalar. 4446 SmallSetVector<Instruction *, 8> ScalarPtrs; 4447 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4448 4449 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4450 // The pointer operands of loads and stores will be scalar as long as the 4451 // memory access is not a gather or scatter operation. The value operand of a 4452 // store will remain scalar if the store is scalarized. 4453 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4454 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4455 assert(WideningDecision != CM_Unknown && 4456 "Widening decision should be ready at this moment"); 4457 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4458 if (Ptr == Store->getValueOperand()) 4459 return WideningDecision == CM_Scalarize; 4460 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4461 "Ptr is neither a value or pointer operand"); 4462 return WideningDecision != CM_GatherScatter; 4463 }; 4464 4465 // A helper that returns true if the given value is a bitcast or 4466 // getelementptr instruction contained in the loop. 4467 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4468 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4469 isa<GetElementPtrInst>(V)) && 4470 !TheLoop->isLoopInvariant(V); 4471 }; 4472 4473 // A helper that evaluates a memory access's use of a pointer. If the use 4474 // will be a scalar use, and the pointer is only used by memory accesses, we 4475 // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4476 // PossibleNonScalarPtrs. 4477 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4478 // We only care about bitcast and getelementptr instructions contained in 4479 // the loop. 4480 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4481 return; 4482 4483 // If the pointer has already been identified as scalar (e.g., if it was 4484 // also identified as uniform), there's nothing to do. 4485 auto *I = cast<Instruction>(Ptr); 4486 if (Worklist.count(I)) 4487 return; 4488 4489 // If the use of the pointer will be a scalar use, and all users of the 4490 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4491 // place the pointer in PossibleNonScalarPtrs. 4492 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4493 return isa<LoadInst>(U) || isa<StoreInst>(U); 4494 })) 4495 ScalarPtrs.insert(I); 4496 else 4497 PossibleNonScalarPtrs.insert(I); 4498 }; 4499 4500 // We seed the scalars analysis with three classes of instructions: (1) 4501 // instructions marked uniform-after-vectorization, (2) bitcast and 4502 // getelementptr instructions used by memory accesses requiring a scalar use, 4503 // and (3) pointer induction variables and their update instructions (we 4504 // currently only scalarize these). 4505 // 4506 // (1) Add to the worklist all instructions that have been identified as 4507 // uniform-after-vectorization. 4508 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4509 4510 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4511 // memory accesses requiring a scalar use. 
The pointer operands of loads and 4512 // stores will be scalar as long as the memory accesses is not a gather or 4513 // scatter operation. The value operand of a store will remain scalar if the 4514 // store is scalarized. 4515 for (auto *BB : TheLoop->blocks()) 4516 for (auto &I : *BB) { 4517 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4518 evaluatePtrUse(Load, Load->getPointerOperand()); 4519 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4520 evaluatePtrUse(Store, Store->getPointerOperand()); 4521 evaluatePtrUse(Store, Store->getValueOperand()); 4522 } 4523 } 4524 for (auto *I : ScalarPtrs) 4525 if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) { 4526 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4527 Worklist.insert(I); 4528 } 4529 4530 // (3) Add to the worklist all pointer induction variables and their update 4531 // instructions. 4532 // 4533 // TODO: Once we are able to vectorize pointer induction variables we should 4534 // no longer insert them into the worklist here. 4535 auto *Latch = TheLoop->getLoopLatch(); 4536 for (auto &Induction : Legal->getInductionVars()) { 4537 auto *Ind = Induction.first; 4538 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4539 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) 4540 continue; 4541 Worklist.insert(Ind); 4542 Worklist.insert(IndUpdate); 4543 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4544 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4545 << "\n"); 4546 } 4547 4548 // Insert the forced scalars. 4549 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4550 // induction variable when the PHI user is scalarized. 4551 auto ForcedScalar = ForcedScalars.find(VF); 4552 if (ForcedScalar != ForcedScalars.end()) 4553 for (auto *I : ForcedScalar->second) 4554 Worklist.insert(I); 4555 4556 // Expand the worklist by looking through any bitcasts and getelementptr 4557 // instructions we've already identified as scalar. This is similar to the 4558 // expansion step in collectLoopUniforms(); however, here we're only 4559 // expanding to include additional bitcasts and getelementptr instructions. 4560 unsigned Idx = 0; 4561 while (Idx != Worklist.size()) { 4562 Instruction *Dst = Worklist[Idx++]; 4563 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4564 continue; 4565 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4566 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4567 auto *J = cast<Instruction>(U); 4568 return !TheLoop->contains(J) || Worklist.count(J) || 4569 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4570 isScalarUse(J, Src)); 4571 })) { 4572 Worklist.insert(Src); 4573 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4574 } 4575 } 4576 4577 // An induction variable will remain scalar if all users of the induction 4578 // variable and induction variable update remain scalar. 4579 for (auto &Induction : Legal->getInductionVars()) { 4580 auto *Ind = Induction.first; 4581 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4582 4583 // We already considered pointer induction variables, so there's no reason 4584 // to look at their users again. 4585 // 4586 // TODO: Once we are able to vectorize pointer induction variables we 4587 // should no longer skip over them here. 
4588 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) 4589 continue; 4590 4591 // Determine if all users of the induction variable are scalar after 4592 // vectorization. 4593 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4594 auto *I = cast<Instruction>(U); 4595 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4596 }); 4597 if (!ScalarInd) 4598 continue; 4599 4600 // Determine if all users of the induction variable update instruction are 4601 // scalar after vectorization. 4602 auto ScalarIndUpdate = 4603 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4604 auto *I = cast<Instruction>(U); 4605 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4606 }); 4607 if (!ScalarIndUpdate) 4608 continue; 4609 4610 // The induction variable and its update instruction will remain scalar. 4611 Worklist.insert(Ind); 4612 Worklist.insert(IndUpdate); 4613 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4614 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4615 << "\n"); 4616 } 4617 4618 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4619 } 4620 4621 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { 4622 if (!blockNeedsPredication(I->getParent())) 4623 return false; 4624 switch(I->getOpcode()) { 4625 default: 4626 break; 4627 case Instruction::Load: 4628 case Instruction::Store: { 4629 if (!Legal->isMaskRequired(I)) 4630 return false; 4631 auto *Ptr = getLoadStorePointerOperand(I); 4632 auto *Ty = getMemInstValueType(I); 4633 // We have already decided how to vectorize this instruction, get that 4634 // result. 4635 if (VF > 1) { 4636 InstWidening WideningDecision = getWideningDecision(I, VF); 4637 assert(WideningDecision != CM_Unknown && 4638 "Widening decision should be ready at this moment"); 4639 return WideningDecision == CM_Scalarize; 4640 } 4641 const MaybeAlign Alignment = getLoadStoreAlignment(I); 4642 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4643 isLegalMaskedGather(Ty, Alignment)) 4644 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4645 isLegalMaskedScatter(Ty, Alignment)); 4646 } 4647 case Instruction::UDiv: 4648 case Instruction::SDiv: 4649 case Instruction::SRem: 4650 case Instruction::URem: 4651 return mayDivideByZero(*I); 4652 } 4653 return false; 4654 } 4655 4656 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, 4657 unsigned VF) { 4658 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4659 assert(getWideningDecision(I, VF) == CM_Unknown && 4660 "Decision should not be set yet."); 4661 auto *Group = getInterleavedAccessGroup(I); 4662 assert(Group && "Must have a group."); 4663 4664 // If the instruction's allocated size doesn't equal it's type size, it 4665 // requires padding and will be scalarized. 4666 auto &DL = I->getModule()->getDataLayout(); 4667 auto *ScalarTy = getMemInstValueType(I); 4668 if (hasIrregularType(ScalarTy, DL, VF)) 4669 return false; 4670 4671 // Check if masking is required. 4672 // A Group may need masking for one of two reasons: it resides in a block that 4673 // needs predication, or it was decided to use masking to deal with gaps. 
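  // Roughly: an interleave group with gaps normally requires a scalar
  // epilogue so that the last wide accesses do not run past the original
  // scalar accesses; if that epilogue is not allowed (e.g. when folding the
  // tail), the group can only be kept if the target supports the access as a
  // masked load/store, which is what the TTI checks below establish.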
4674 bool PredicatedAccessRequiresMasking = 4675 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 4676 bool AccessWithGapsRequiresMasking = 4677 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 4678 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 4679 return true; 4680 4681 // If masked interleaving is required, we expect that the user/target had 4682 // enabled it, because otherwise it either wouldn't have been created or 4683 // it should have been invalidated by the CostModel. 4684 assert(useMaskedInterleavedAccesses(TTI) && 4685 "Masked interleave-groups for predicated accesses are not enabled."); 4686 4687 auto *Ty = getMemInstValueType(I); 4688 const MaybeAlign Alignment = getLoadStoreAlignment(I); 4689 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4690 : TTI.isLegalMaskedStore(Ty, Alignment); 4691 } 4692 4693 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, 4694 unsigned VF) { 4695 // Get and ensure we have a valid memory instruction. 4696 LoadInst *LI = dyn_cast<LoadInst>(I); 4697 StoreInst *SI = dyn_cast<StoreInst>(I); 4698 assert((LI || SI) && "Invalid memory instruction"); 4699 4700 auto *Ptr = getLoadStorePointerOperand(I); 4701 4702 // In order to be widened, the pointer should be consecutive, first of all. 4703 if (!Legal->isConsecutivePtr(Ptr)) 4704 return false; 4705 4706 // If the instruction is a store located in a predicated block, it will be 4707 // scalarized. 4708 if (isScalarWithPredication(I)) 4709 return false; 4710 4711 // If the instruction's allocated size doesn't equal it's type size, it 4712 // requires padding and will be scalarized. 4713 auto &DL = I->getModule()->getDataLayout(); 4714 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 4715 if (hasIrregularType(ScalarTy, DL, VF)) 4716 return false; 4717 4718 return true; 4719 } 4720 4721 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { 4722 // We should not collect Uniforms more than once per VF. Right now, 4723 // this function is called from collectUniformsAndScalars(), which 4724 // already does this check. Collecting Uniforms for VF=1 does not make any 4725 // sense. 4726 4727 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && 4728 "This function should not be visited twice for the same VF"); 4729 4730 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4731 // not analyze again. Uniforms.count(VF) will return 1. 4732 Uniforms[VF].clear(); 4733 4734 // We now know that the loop is vectorizable! 4735 // Collect instructions inside the loop that will remain uniform after 4736 // vectorization. 4737 4738 // Global values, params and instructions outside of current loop are out of 4739 // scope. 4740 auto isOutOfScope = [&](Value *V) -> bool { 4741 Instruction *I = dyn_cast<Instruction>(V); 4742 return (!I || !TheLoop->contains(I)); 4743 }; 4744 4745 SetVector<Instruction *> Worklist; 4746 BasicBlock *Latch = TheLoop->getLoopLatch(); 4747 4748 // Instructions that are scalar with predication must not be considered 4749 // uniform after vectorization, because that would create an erroneous 4750 // replicating region where only a single instance out of VF should be formed. 4751 // TODO: optimize such seldom cases if found important, see PR40816. 
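  // Hypothetical example: a udiv that only executes under "if (b[i] != 0)"
  // is scalar-with-predication (it is replicated per lane inside its own
  // predicated block), so it must not be treated as uniform even if its
  // operands are; the lambda below filters out exactly such instructions.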
4752 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4753 if (isScalarWithPredication(I, VF)) { 4754 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4755 << *I << "\n"); 4756 return; 4757 } 4758 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4759 Worklist.insert(I); 4760 }; 4761 4762 // Start with the conditional branch. If the branch condition is an 4763 // instruction contained in the loop that is only used by the branch, it is 4764 // uniform. 4765 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4766 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4767 addToWorklistIfAllowed(Cmp); 4768 4769 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 4770 // are pointers that are treated like consecutive pointers during 4771 // vectorization. The pointer operands of interleaved accesses are an 4772 // example. 4773 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 4774 4775 // Holds pointer operands of instructions that are possibly non-uniform. 4776 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 4777 4778 auto isUniformDecision = [&](Instruction *I, unsigned VF) { 4779 InstWidening WideningDecision = getWideningDecision(I, VF); 4780 assert(WideningDecision != CM_Unknown && 4781 "Widening decision should be ready at this moment"); 4782 4783 return (WideningDecision == CM_Widen || 4784 WideningDecision == CM_Widen_Reverse || 4785 WideningDecision == CM_Interleave); 4786 }; 4787 // Iterate over the instructions in the loop, and collect all 4788 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 4789 // that a consecutive-like pointer operand will be scalarized, we collect it 4790 // in PossibleNonUniformPtrs instead. We use two sets here because a single 4791 // getelementptr instruction can be used by both vectorized and scalarized 4792 // memory instructions. For example, if a loop loads and stores from the same 4793 // location, but the store is conditional, the store will be scalarized, and 4794 // the getelementptr won't remain uniform. 4795 for (auto *BB : TheLoop->blocks()) 4796 for (auto &I : *BB) { 4797 // If there's no pointer operand, there's nothing to do. 4798 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 4799 if (!Ptr) 4800 continue; 4801 4802 // True if all users of Ptr are memory accesses that have Ptr as their 4803 // pointer operand. 4804 auto UsersAreMemAccesses = 4805 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 4806 return getLoadStorePointerOperand(U) == Ptr; 4807 }); 4808 4809 // Ensure the memory instruction will not be scalarized or used by 4810 // gather/scatter, making its pointer operand non-uniform. If the pointer 4811 // operand is used by any instruction other than a memory access, we 4812 // conservatively assume the pointer operand may be non-uniform. 4813 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 4814 PossibleNonUniformPtrs.insert(Ptr); 4815 4816 // If the memory instruction will be vectorized and its pointer operand 4817 // is consecutive-like, or interleaving - the pointer operand should 4818 // remain uniform. 4819 else 4820 ConsecutiveLikePtrs.insert(Ptr); 4821 } 4822 4823 // Add to the Worklist all consecutive and consecutive-like pointers that 4824 // aren't also identified as possibly non-uniform. 
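  // For example: a GEP that only feeds unconditional, consecutive accesses
  // stays uniform, since the wide access is addressed from its lane-0 value;
  // if the same GEP also feeds a conditional store that will be scalarized,
  // it was placed in PossibleNonUniformPtrs above and is filtered out here.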
4825 for (auto *V : ConsecutiveLikePtrs) 4826 if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) 4827 addToWorklistIfAllowed(V); 4828 4829 // Expand Worklist in topological order: whenever a new instruction 4830 // is added , its users should be already inside Worklist. It ensures 4831 // a uniform instruction will only be used by uniform instructions. 4832 unsigned idx = 0; 4833 while (idx != Worklist.size()) { 4834 Instruction *I = Worklist[idx++]; 4835 4836 for (auto OV : I->operand_values()) { 4837 // isOutOfScope operands cannot be uniform instructions. 4838 if (isOutOfScope(OV)) 4839 continue; 4840 // First order recurrence Phi's should typically be considered 4841 // non-uniform. 4842 auto *OP = dyn_cast<PHINode>(OV); 4843 if (OP && Legal->isFirstOrderRecurrence(OP)) 4844 continue; 4845 // If all the users of the operand are uniform, then add the 4846 // operand into the uniform worklist. 4847 auto *OI = cast<Instruction>(OV); 4848 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4849 auto *J = cast<Instruction>(U); 4850 return Worklist.count(J) || 4851 (OI == getLoadStorePointerOperand(J) && 4852 isUniformDecision(J, VF)); 4853 })) 4854 addToWorklistIfAllowed(OI); 4855 } 4856 } 4857 4858 // Returns true if Ptr is the pointer operand of a memory access instruction 4859 // I, and I is known to not require scalarization. 4860 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4861 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4862 }; 4863 4864 // For an instruction to be added into Worklist above, all its users inside 4865 // the loop should also be in Worklist. However, this condition cannot be 4866 // true for phi nodes that form a cyclic dependence. We must process phi 4867 // nodes separately. An induction variable will remain uniform if all users 4868 // of the induction variable and induction variable update remain uniform. 4869 // The code below handles both pointer and non-pointer induction variables. 4870 for (auto &Induction : Legal->getInductionVars()) { 4871 auto *Ind = Induction.first; 4872 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4873 4874 // Determine if all users of the induction variable are uniform after 4875 // vectorization. 4876 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4877 auto *I = cast<Instruction>(U); 4878 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4879 isVectorizedMemAccessUse(I, Ind); 4880 }); 4881 if (!UniformInd) 4882 continue; 4883 4884 // Determine if all users of the induction variable update instruction are 4885 // uniform after vectorization. 4886 auto UniformIndUpdate = 4887 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4888 auto *I = cast<Instruction>(U); 4889 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4890 isVectorizedMemAccessUse(I, IndUpdate); 4891 }); 4892 if (!UniformIndUpdate) 4893 continue; 4894 4895 // The induction variable and its update instruction will remain uniform. 4896 addToWorklistIfAllowed(Ind); 4897 addToWorklistIfAllowed(IndUpdate); 4898 } 4899 4900 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4901 } 4902 4903 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4904 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4905 4906 if (Legal->getRuntimePointerChecking()->Need) { 4907 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4908 "runtime pointer checks needed. 
Enable vectorization of this " 4909 "loop with '#pragma clang loop vectorize(enable)' when " 4910 "compiling with -Os/-Oz", 4911 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4912 return true; 4913 } 4914 4915 if (!PSE.getUnionPredicate().getPredicates().empty()) { 4916 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4917 "runtime SCEV checks needed. Enable vectorization of this " 4918 "loop with '#pragma clang loop vectorize(enable)' when " 4919 "compiling with -Os/-Oz", 4920 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4921 return true; 4922 } 4923 4924 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4925 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4926 reportVectorizationFailure("Runtime stride check is required with -Os/-Oz", 4927 "runtime stride == 1 checks needed. Enable vectorization of " 4928 "this loop with '#pragma clang loop vectorize(enable)' when " 4929 "compiling with -Os/-Oz", 4930 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4931 return true; 4932 } 4933 4934 return false; 4935 } 4936 4937 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { 4938 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4939 // TODO: It may by useful to do since it's still likely to be dynamically 4940 // uniform if the target can skip. 4941 reportVectorizationFailure( 4942 "Not inserting runtime ptr check for divergent target", 4943 "runtime pointer checks needed. Not enabled for divergent target", 4944 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 4945 return None; 4946 } 4947 4948 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4949 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4950 if (TC == 1) { 4951 reportVectorizationFailure("Single iteration (non) loop", 4952 "loop trip count is one, irrelevant for vectorization", 4953 "SingleIterationLoop", ORE, TheLoop); 4954 return None; 4955 } 4956 4957 switch (ScalarEpilogueStatus) { 4958 case CM_ScalarEpilogueAllowed: 4959 return computeFeasibleMaxVF(TC); 4960 case CM_ScalarEpilogueNotNeededUsePredicate: 4961 LLVM_DEBUG( 4962 dbgs() << "LV: vector predicate hint/switch found.\n" 4963 << "LV: Not allowing scalar epilogue, creating predicated " 4964 << "vector loop.\n"); 4965 break; 4966 case CM_ScalarEpilogueNotAllowedLowTripLoop: 4967 // fallthrough as a special case of OptForSize 4968 case CM_ScalarEpilogueNotAllowedOptSize: 4969 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 4970 LLVM_DEBUG( 4971 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 4972 else 4973 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4974 << "count.\n"); 4975 4976 // Bail if runtime checks are required, which are not good when optimising 4977 // for size. 4978 if (runtimeChecksRequired()) 4979 return None; 4980 break; 4981 } 4982 4983 // Now try the tail folding 4984 4985 // Invalidate interleave groups that require an epilogue if we can't mask 4986 // the interleave-group. 4987 if (!useMaskedInterleavedAccesses(TTI)) 4988 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 4989 4990 unsigned MaxVF = computeFeasibleMaxVF(TC); 4991 if (TC > 0 && TC % MaxVF == 0) { 4992 // Accept MaxVF if we do not have a tail. 
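    // Worked example: a known trip count of 64 with MaxVF = 8 gives
    // 64 % 8 == 0, so no tail iterations remain and MaxVF is accepted here
    // without having to fold the tail by masking below.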
4993 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 4994 return MaxVF; 4995 } 4996 4997 // If we don't know the precise trip count, or if the trip count that we 4998 // found modulo the vectorization factor is not zero, try to fold the tail 4999 // by masking. 5000 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5001 if (Legal->prepareToFoldTailByMasking()) { 5002 FoldTailByMasking = true; 5003 return MaxVF; 5004 } 5005 5006 if (TC == 0) { 5007 reportVectorizationFailure( 5008 "Unable to calculate the loop count due to complex control flow", 5009 "unable to calculate the loop count due to complex control flow", 5010 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5011 return None; 5012 } 5013 5014 reportVectorizationFailure( 5015 "Cannot optimize for size and vectorize at the same time.", 5016 "cannot optimize for size and vectorize at the same time. " 5017 "Enable vectorization of this loop with '#pragma clang loop " 5018 "vectorize(enable)' when compiling with -Os/-Oz", 5019 "NoTailLoopWithOptForSize", ORE, TheLoop); 5020 return None; 5021 } 5022 5023 unsigned 5024 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 5025 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5026 unsigned SmallestType, WidestType; 5027 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5028 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5029 5030 // Get the maximum safe dependence distance in bits computed by LAA. 5031 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5032 // the memory accesses that is most restrictive (involved in the smallest 5033 // dependence distance). 5034 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 5035 5036 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 5037 5038 unsigned MaxVectorSize = WidestRegister / WidestType; 5039 5040 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5041 << " / " << WidestType << " bits.\n"); 5042 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5043 << WidestRegister << " bits.\n"); 5044 5045 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 5046 " into one vector!"); 5047 if (MaxVectorSize == 0) { 5048 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5049 MaxVectorSize = 1; 5050 return MaxVectorSize; 5051 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5052 isPowerOf2_32(ConstTripCount)) { 5053 // We need to clamp the VF to be the ConstTripCount. There is no point in 5054 // choosing a higher viable VF as done in the loop below. 5055 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5056 << ConstTripCount << "\n"); 5057 MaxVectorSize = ConstTripCount; 5058 return MaxVectorSize; 5059 } 5060 5061 unsigned MaxVF = MaxVectorSize; 5062 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5063 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5064 // Collect all viable vectorization factors larger than the default MaxVF 5065 // (i.e. MaxVectorSize). 5066 SmallVector<unsigned, 8> VFs; 5067 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5068 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5069 VFs.push_back(VS); 5070 5071 // For each VF calculate its register usage. 5072 auto RUs = calculateRegisterUsage(VFs); 5073 5074 // Select the largest VF which doesn't require more registers than existing 5075 // ones. 
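    // Walk-through: VFs was built above in increasing order (MaxVectorSize*2,
    // *4, ...), so the loop below starts from the largest candidate and keeps
    // the first one whose per-class MaxLocalUsers fits within
    // TTI.getNumberOfRegisters() for every register class.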
5076 for (int i = RUs.size() - 1; i >= 0; --i) { 5077 bool Selected = true; 5078 for (auto& pair : RUs[i].MaxLocalUsers) { 5079 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5080 if (pair.second > TargetNumRegisters) 5081 Selected = false; 5082 } 5083 if (Selected) { 5084 MaxVF = VFs[i]; 5085 break; 5086 } 5087 } 5088 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5089 if (MaxVF < MinVF) { 5090 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5091 << ") with target's minimum: " << MinVF << '\n'); 5092 MaxVF = MinVF; 5093 } 5094 } 5095 } 5096 return MaxVF; 5097 } 5098 5099 VectorizationFactor 5100 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5101 float Cost = expectedCost(1).first; 5102 const float ScalarCost = Cost; 5103 unsigned Width = 1; 5104 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5105 5106 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5107 if (ForceVectorization && MaxVF > 1) { 5108 // Ignore scalar width, because the user explicitly wants vectorization. 5109 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5110 // evaluation. 5111 Cost = std::numeric_limits<float>::max(); 5112 } 5113 5114 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5115 // Notice that the vector loop needs to be executed less times, so 5116 // we need to divide the cost of the vector loops by the width of 5117 // the vector elements. 5118 VectorizationCostTy C = expectedCost(i); 5119 float VectorCost = C.first / (float)i; 5120 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5121 << " costs: " << (int)VectorCost << ".\n"); 5122 if (!C.second && !ForceVectorization) { 5123 LLVM_DEBUG( 5124 dbgs() << "LV: Not considering vector loop of width " << i 5125 << " because it will not generate any vector instructions.\n"); 5126 continue; 5127 } 5128 if (VectorCost < Cost) { 5129 Cost = VectorCost; 5130 Width = i; 5131 } 5132 } 5133 5134 if (!EnableCondStoresVectorization && NumPredStores) { 5135 reportVectorizationFailure("There are conditional stores.", 5136 "store that is conditionally executed prevents vectorization", 5137 "ConditionalStore", ORE, TheLoop); 5138 Width = 1; 5139 Cost = ScalarCost; 5140 } 5141 5142 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5143 << "LV: Vectorization seems to be not beneficial, " 5144 << "but was forced by a user.\n"); 5145 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5146 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; 5147 return Factor; 5148 } 5149 5150 std::pair<unsigned, unsigned> 5151 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5152 unsigned MinWidth = -1U; 5153 unsigned MaxWidth = 8; 5154 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5155 5156 // For each block. 5157 for (BasicBlock *BB : TheLoop->blocks()) { 5158 // For each instruction in the loop. 5159 for (Instruction &I : BB->instructionsWithoutDebug()) { 5160 Type *T = I.getType(); 5161 5162 // Skip ignored values. 5163 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end()) 5164 continue; 5165 5166 // Only examine Loads, Stores and PHINodes. 5167 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5168 continue; 5169 5170 // Examine PHI nodes that are reduction variables. Update the type to 5171 // account for the recurrence type. 
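      // Note: the recurrence type may be narrower than the PHI's declared
      // type when the reduction can be performed in a smaller type, which is
      // why T is overridden below before the width bounds are updated.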
5172 if (auto *PN = dyn_cast<PHINode>(&I)) { 5173 if (!Legal->isReductionVariable(PN)) 5174 continue; 5175 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5176 T = RdxDesc.getRecurrenceType(); 5177 } 5178 5179 // Examine the stored values. 5180 if (auto *ST = dyn_cast<StoreInst>(&I)) 5181 T = ST->getValueOperand()->getType(); 5182 5183 // Ignore loaded pointer types and stored pointer types that are not 5184 // vectorizable. 5185 // 5186 // FIXME: The check here attempts to predict whether a load or store will 5187 // be vectorized. We only know this for certain after a VF has 5188 // been selected. Here, we assume that if an access can be 5189 // vectorized, it will be. We should also look at extending this 5190 // optimization to non-pointer types. 5191 // 5192 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5193 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5194 continue; 5195 5196 MinWidth = std::min(MinWidth, 5197 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5198 MaxWidth = std::max(MaxWidth, 5199 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5200 } 5201 } 5202 5203 return {MinWidth, MaxWidth}; 5204 } 5205 5206 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, 5207 unsigned LoopCost) { 5208 // -- The interleave heuristics -- 5209 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5210 // There are many micro-architectural considerations that we can't predict 5211 // at this level. For example, frontend pressure (on decode or fetch) due to 5212 // code size, or the number and capabilities of the execution ports. 5213 // 5214 // We use the following heuristics to select the interleave count: 5215 // 1. If the code has reductions, then we interleave to break the cross 5216 // iteration dependency. 5217 // 2. If the loop is really small, then we interleave to reduce the loop 5218 // overhead. 5219 // 3. We don't interleave if we think that we will spill registers to memory 5220 // due to the increased register pressure. 5221 5222 if (!isScalarEpilogueAllowed()) 5223 return 1; 5224 5225 // We used the distance for the interleave count. 5226 if (Legal->getMaxSafeDepDistBytes() != -1U) 5227 return 1; 5228 5229 // Do not interleave loops with a relatively small known or estimated trip 5230 // count. 5231 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5232 if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) 5233 return 1; 5234 5235 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5236 // We divide by these constants so assume that we have at least one 5237 // instruction that uses at least one register. 5238 for (auto& pair : R.MaxLocalUsers) { 5239 pair.second = std::max(pair.second, 1U); 5240 } 5241 5242 // We calculate the interleave count using the following formula. 5243 // Subtract the number of loop invariants from the number of available 5244 // registers. These registers are used by all of the interleaved instances. 5245 // Next, divide the remaining registers by the number of registers that is 5246 // required by the loop, in order to estimate how many parallel instances 5247 // fit without causing spills. All of this is rounded down if necessary to be 5248 // a power of two. We want power of two interleave count to simplify any 5249 // addressing operations or alignment considerations. 
5250 // We also want power of two interleave counts to ensure that the induction 5251 // variable of the vector loop wraps to zero, when tail is folded by masking; 5252 // this currently happens when OptForSize, in which case IC is set to 1 above. 5253 unsigned IC = UINT_MAX; 5254 5255 for (auto& pair : R.MaxLocalUsers) { 5256 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5257 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5258 << " registers of " 5259 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5260 if (VF == 1) { 5261 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5262 TargetNumRegisters = ForceTargetNumScalarRegs; 5263 } else { 5264 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5265 TargetNumRegisters = ForceTargetNumVectorRegs; 5266 } 5267 unsigned MaxLocalUsers = pair.second; 5268 unsigned LoopInvariantRegs = 0; 5269 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5270 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5271 5272 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5273 // Don't count the induction variable as interleaved. 5274 if (EnableIndVarRegisterHeur) { 5275 TmpIC = 5276 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5277 std::max(1U, (MaxLocalUsers - 1))); 5278 } 5279 5280 IC = std::min(IC, TmpIC); 5281 } 5282 5283 // Clamp the interleave ranges to reasonable counts. 5284 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5285 5286 // Check if the user has overridden the max. 5287 if (VF == 1) { 5288 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5289 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5290 } else { 5291 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5292 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5293 } 5294 5295 // If trip count is known or estimated compile time constant, limit the 5296 // interleave count to be less than the trip count divided by VF. 5297 if (BestKnownTC) { 5298 MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); 5299 } 5300 5301 // If we did not calculate the cost for VF (because the user selected the VF) 5302 // then we calculate the cost of VF here. 5303 if (LoopCost == 0) 5304 LoopCost = expectedCost(VF).first; 5305 5306 assert(LoopCost && "Non-zero loop cost expected"); 5307 5308 // Clamp the calculated IC to be between the 1 and the max interleave count 5309 // that the target and trip count allows. 5310 if (IC > MaxInterleaveCount) 5311 IC = MaxInterleaveCount; 5312 else if (IC < 1) 5313 IC = 1; 5314 5315 // Interleave if we vectorized this loop and there is a reduction that could 5316 // benefit from interleaving. 5317 if (VF > 1 && !Legal->getReductionVars().empty()) { 5318 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5319 return IC; 5320 } 5321 5322 // Note that if we've already vectorized the loop we will have done the 5323 // runtime check and so interleaving won't require further checks. 5324 bool InterleavingRequiresRuntimePointerCheck = 5325 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5326 5327 // We want to interleave small loops in order to reduce the loop overhead and 5328 // potentially expose ILP opportunities. 
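  // Rough illustration of the small-loop path below: with, say,
  // LoopCost == SmallLoopCost / 4, we get
  //   SmallIC = min(IC, PowerOf2Floor(SmallLoopCost / LoopCost)) = min(IC, 4);
  // this may then be overridden by max(StoresIC, LoadsIC) to saturate the
  // store/load ports, and nested scalar reductions clamp all of these values
  // to MaxNestedScalarReductionIC.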
5329 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5330 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5331 // We assume that the cost overhead is 1 and we use the cost model 5332 // to estimate the cost of the loop and interleave until the cost of the 5333 // loop overhead is about 5% of the cost of the loop. 5334 unsigned SmallIC = 5335 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5336 5337 // Interleave until store/load ports (estimated by max interleave count) are 5338 // saturated. 5339 unsigned NumStores = Legal->getNumStores(); 5340 unsigned NumLoads = Legal->getNumLoads(); 5341 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5342 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5343 5344 // If we have a scalar reduction (vector reductions are already dealt with 5345 // by this point), we can increase the critical path length if the loop 5346 // we're interleaving is inside another loop. Limit, by default to 2, so the 5347 // critical path only gets increased by one reduction operation. 5348 if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) { 5349 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5350 SmallIC = std::min(SmallIC, F); 5351 StoresIC = std::min(StoresIC, F); 5352 LoadsIC = std::min(LoadsIC, F); 5353 } 5354 5355 if (EnableLoadStoreRuntimeInterleave && 5356 std::max(StoresIC, LoadsIC) > SmallIC) { 5357 LLVM_DEBUG( 5358 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5359 return std::max(StoresIC, LoadsIC); 5360 } 5361 5362 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5363 return SmallIC; 5364 } 5365 5366 // Interleave if this is a large loop (small loops are already dealt with by 5367 // this point) that could benefit from interleaving. 5368 bool HasReductions = !Legal->getReductionVars().empty(); 5369 if (TTI.enableAggressiveInterleaving(HasReductions)) { 5370 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5371 return IC; 5372 } 5373 5374 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5375 return 1; 5376 } 5377 5378 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5379 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { 5380 // This function calculates the register usage by measuring the highest number 5381 // of values that are alive at a single location. Obviously, this is a very 5382 // rough estimation. We scan the loop in a topological order in order and 5383 // assign a number to each instruction. We use RPO to ensure that defs are 5384 // met before their users. We assume that each instruction that has in-loop 5385 // users starts an interval. We record every time that an in-loop value is 5386 // used, so we have a list of the first and last occurrences of each 5387 // instruction. Next, we transpose this data structure into a multi map that 5388 // holds the list of intervals that *end* at a specific location. This multi 5389 // map allows us to perform a linear search. We scan the instructions linearly 5390 // and record each time that a new interval starts, by placing it in a set. 5391 // If we find this value in the multi-map then we remove it from the set. 5392 // The max register usage is the maximum size of the set. 5393 // We also search for instructions that are defined outside the loop, but are 5394 // used inside the loop. 
We need this number separately from the max-interval 5395 // usage number because when we unroll, loop-invariant values do not take 5396 // more register. 5397 LoopBlocksDFS DFS(TheLoop); 5398 DFS.perform(LI); 5399 5400 RegisterUsage RU; 5401 5402 // Each 'key' in the map opens a new interval. The values 5403 // of the map are the index of the 'last seen' usage of the 5404 // instruction that is the key. 5405 using IntervalMap = DenseMap<Instruction *, unsigned>; 5406 5407 // Maps instruction to its index. 5408 SmallVector<Instruction *, 64> IdxToInstr; 5409 // Marks the end of each interval. 5410 IntervalMap EndPoint; 5411 // Saves the list of instruction indices that are used in the loop. 5412 SmallPtrSet<Instruction *, 8> Ends; 5413 // Saves the list of values that are used in the loop but are 5414 // defined outside the loop, such as arguments and constants. 5415 SmallPtrSet<Value *, 8> LoopInvariants; 5416 5417 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5418 for (Instruction &I : BB->instructionsWithoutDebug()) { 5419 IdxToInstr.push_back(&I); 5420 5421 // Save the end location of each USE. 5422 for (Value *U : I.operands()) { 5423 auto *Instr = dyn_cast<Instruction>(U); 5424 5425 // Ignore non-instruction values such as arguments, constants, etc. 5426 if (!Instr) 5427 continue; 5428 5429 // If this instruction is outside the loop then record it and continue. 5430 if (!TheLoop->contains(Instr)) { 5431 LoopInvariants.insert(Instr); 5432 continue; 5433 } 5434 5435 // Overwrite previous end points. 5436 EndPoint[Instr] = IdxToInstr.size(); 5437 Ends.insert(Instr); 5438 } 5439 } 5440 } 5441 5442 // Saves the list of intervals that end with the index in 'key'. 5443 using InstrList = SmallVector<Instruction *, 2>; 5444 DenseMap<unsigned, InstrList> TransposeEnds; 5445 5446 // Transpose the EndPoints to a list of values that end at each index. 5447 for (auto &Interval : EndPoint) 5448 TransposeEnds[Interval.second].push_back(Interval.first); 5449 5450 SmallPtrSet<Instruction *, 8> OpenIntervals; 5451 5452 // Get the size of the widest register. 5453 unsigned MaxSafeDepDist = -1U; 5454 if (Legal->getMaxSafeDepDistBytes() != -1U) 5455 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5456 unsigned WidestRegister = 5457 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5458 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5459 5460 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5461 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5462 5463 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5464 5465 // A lambda that gets the register usage for the given type and VF. 5466 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5467 if (Ty->isTokenTy()) 5468 return 0U; 5469 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5470 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5471 }; 5472 5473 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5474 Instruction *I = IdxToInstr[i]; 5475 5476 // Remove all of the instructions that end at this location. 5477 InstrList &List = TransposeEnds[i]; 5478 for (Instruction *ToRemove : List) 5479 OpenIntervals.erase(ToRemove); 5480 5481 // Ignore instructions that are never used within the loop. 5482 if (Ends.find(I) == Ends.end()) 5483 continue; 5484 5485 // Skip ignored values. 5486 if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) 5487 continue; 5488 5489 // For each VF find the maximum usage of registers. 
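    // For example (assuming 128-bit wide registers): an i64 value live across
    // this point contributes GetRegUsage(i64, 4) = max(1, 4 * 64 / 128) = 2
    // registers to its class at VF = 4, while scalar values (and all values
    // at VF = 1) count as a single register each.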
5490 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5491 // Count the number of live intervals. 5492 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5493 5494 if (VFs[j] == 1) { 5495 for (auto Inst : OpenIntervals) { 5496 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5497 if (RegUsage.find(ClassID) == RegUsage.end()) 5498 RegUsage[ClassID] = 1; 5499 else 5500 RegUsage[ClassID] += 1; 5501 } 5502 } else { 5503 collectUniformsAndScalars(VFs[j]); 5504 for (auto Inst : OpenIntervals) { 5505 // Skip ignored values for VF > 1. 5506 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) 5507 continue; 5508 if (isScalarAfterVectorization(Inst, VFs[j])) { 5509 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5510 if (RegUsage.find(ClassID) == RegUsage.end()) 5511 RegUsage[ClassID] = 1; 5512 else 5513 RegUsage[ClassID] += 1; 5514 } else { 5515 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5516 if (RegUsage.find(ClassID) == RegUsage.end()) 5517 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5518 else 5519 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5520 } 5521 } 5522 } 5523 5524 for (auto& pair : RegUsage) { 5525 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5526 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5527 else 5528 MaxUsages[j][pair.first] = pair.second; 5529 } 5530 } 5531 5532 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5533 << OpenIntervals.size() << '\n'); 5534 5535 // Add the current instruction to the list of open intervals. 5536 OpenIntervals.insert(I); 5537 } 5538 5539 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5540 SmallMapVector<unsigned, unsigned, 4> Invariant; 5541 5542 for (auto Inst : LoopInvariants) { 5543 unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5544 unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); 5545 if (Invariant.find(ClassID) == Invariant.end()) 5546 Invariant[ClassID] = Usage; 5547 else 5548 Invariant[ClassID] += Usage; 5549 } 5550 5551 LLVM_DEBUG({ 5552 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5553 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5554 << " item\n"; 5555 for (const auto &pair : MaxUsages[i]) { 5556 dbgs() << "LV(REG): RegisterClass: " 5557 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5558 << " registers\n"; 5559 } 5560 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5561 << " item\n"; 5562 for (const auto &pair : Invariant) { 5563 dbgs() << "LV(REG): RegisterClass: " 5564 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5565 << " registers\n"; 5566 } 5567 }); 5568 5569 RU.LoopInvariantRegs = Invariant; 5570 RU.MaxLocalUsers = MaxUsages[i]; 5571 RUs[i] = RU; 5572 } 5573 5574 return RUs; 5575 } 5576 5577 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5578 // TODO: Cost model for emulated masked load/store is completely 5579 // broken. This hack guides the cost model to use an artificially 5580 // high enough value to practically disable vectorization with such 5581 // operations, except where previously deployed legality hack allowed 5582 // using very low cost values. This is to avoid regressions coming simply 5583 // from moving "masked load/store" check from legality to cost model. 5584 // Masked Load/Gather emulation was previously never allowed. 5585 // Limited number of Masked Store/Scatter emulation was allowed. 
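  // In short (informal summary of the check below): emulated masked loads
  // always receive the artificially high cost, while emulated masked stores
  // only do once NumPredStores exceeds NumberOfStoresToPredicate.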
  assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}

void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine whether it would be better not to if-convert the blocks they are
  // in. If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply the discount logic if a hacked cost is needed
        // for emulated masked memrefs.
        if (!useEmulatedMaskMemRefHack(&I) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }
  }
}

int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
    unsigned VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  int Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
5659 // 5660 // We assume we will only emit a value for lane zero of an instruction 5661 // marked uniform after vectorization, rather than VF identical values. 5662 // Thus, if we scalarize an instruction that uses a uniform, we would 5663 // create uses of values corresponding to the lanes we aren't emitting code 5664 // for. This behavior can be changed by allowing getScalarValue to clone 5665 // the lane zero values for uniforms rather than asserting. 5666 for (Use &U : I->operands()) 5667 if (auto *J = dyn_cast<Instruction>(U.get())) 5668 if (isUniformAfterVectorization(J, VF)) 5669 return false; 5670 5671 // Otherwise, we can scalarize the instruction. 5672 return true; 5673 }; 5674 5675 // Compute the expected cost discount from scalarizing the entire expression 5676 // feeding the predicated instruction. We currently only consider expressions 5677 // that are single-use instruction chains. 5678 Worklist.push_back(PredInst); 5679 while (!Worklist.empty()) { 5680 Instruction *I = Worklist.pop_back_val(); 5681 5682 // If we've already analyzed the instruction, there's nothing to do. 5683 if (ScalarCosts.find(I) != ScalarCosts.end()) 5684 continue; 5685 5686 // Compute the cost of the vector instruction. Note that this cost already 5687 // includes the scalarization overhead of the predicated instruction. 5688 unsigned VectorCost = getInstructionCost(I, VF).first; 5689 5690 // Compute the cost of the scalarized instruction. This cost is the cost of 5691 // the instruction as if it wasn't if-converted and instead remained in the 5692 // predicated block. We will scale this cost by block probability after 5693 // computing the scalarization overhead. 5694 unsigned ScalarCost = VF * getInstructionCost(I, 1).first; 5695 5696 // Compute the scalarization overhead of needed insertelement instructions 5697 // and phi nodes. 5698 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5699 ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), 5700 true, false); 5701 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); 5702 } 5703 5704 // Compute the scalarization overhead of needed extractelement 5705 // instructions. For each of the instruction's operands, if the operand can 5706 // be scalarized, add it to the worklist; otherwise, account for the 5707 // overhead. 5708 for (Use &U : I->operands()) 5709 if (auto *J = dyn_cast<Instruction>(U.get())) { 5710 assert(VectorType::isValidElementType(J->getType()) && 5711 "Instruction has non-scalar type"); 5712 if (canBeScalarized(J)) 5713 Worklist.push_back(J); 5714 else if (needsExtract(J, VF)) 5715 ScalarCost += TTI.getScalarizationOverhead( 5716 ToVectorTy(J->getType(),VF), false, true); 5717 } 5718 5719 // Scale the total scalar cost by block probability. 5720 ScalarCost /= getReciprocalPredBlockProb(); 5721 5722 // Compute the discount. A non-negative discount means the vector version 5723 // of the instruction costs more, and scalarizing would be beneficial. 5724 Discount += VectorCost - ScalarCost; 5725 ScalarCosts[I] = ScalarCost; 5726 } 5727 5728 return Discount; 5729 } 5730 5731 LoopVectorizationCostModel::VectorizationCostTy 5732 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5733 VectorizationCostTy Cost; 5734 5735 // For each block. 5736 for (BasicBlock *BB : TheLoop->blocks()) { 5737 VectorizationCostTy BlockCost; 5738 5739 // For each instruction in the old loop. 5740 for (Instruction &I : BB->instructionsWithoutDebug()) { 5741 // Skip ignored values. 
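      // (ValuesToIgnore holds ephemeral values, while VecValuesToIgnore holds
      // the reduction/induction cast instructions gathered in
      // collectValuesToIgnore; the latter only matters for VF > 1.)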
5742 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() || 5743 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end())) 5744 continue; 5745 5746 VectorizationCostTy C = getInstructionCost(&I, VF); 5747 5748 // Check if we should override the cost. 5749 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5750 C.first = ForceTargetInstructionCost; 5751 5752 BlockCost.first += C.first; 5753 BlockCost.second |= C.second; 5754 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5755 << " for VF " << VF << " For instruction: " << I 5756 << '\n'); 5757 } 5758 5759 // If we are vectorizing a predicated block, it will have been 5760 // if-converted. This means that the block's instructions (aside from 5761 // stores and instructions that may divide by zero) will now be 5762 // unconditionally executed. For the scalar case, we may not always execute 5763 // the predicated block. Thus, scale the block's cost by the probability of 5764 // executing it. 5765 if (VF == 1 && blockNeedsPredication(BB)) 5766 BlockCost.first /= getReciprocalPredBlockProb(); 5767 5768 Cost.first += BlockCost.first; 5769 Cost.second |= BlockCost.second; 5770 } 5771 5772 return Cost; 5773 } 5774 5775 /// Gets Address Access SCEV after verifying that the access pattern 5776 /// is loop invariant except the induction variable dependence. 5777 /// 5778 /// This SCEV can be sent to the Target in order to estimate the address 5779 /// calculation cost. 5780 static const SCEV *getAddressAccessSCEV( 5781 Value *Ptr, 5782 LoopVectorizationLegality *Legal, 5783 PredicatedScalarEvolution &PSE, 5784 const Loop *TheLoop) { 5785 5786 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5787 if (!Gep) 5788 return nullptr; 5789 5790 // We are looking for a gep with all loop invariant indices except for one 5791 // which should be an induction variable. 5792 auto SE = PSE.getSE(); 5793 unsigned NumOperands = Gep->getNumOperands(); 5794 for (unsigned i = 1; i < NumOperands; ++i) { 5795 Value *Opd = Gep->getOperand(i); 5796 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5797 !Legal->isInductionVariable(Opd)) 5798 return nullptr; 5799 } 5800 5801 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 5802 return PSE.getSCEV(Ptr); 5803 } 5804 5805 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 5806 return Legal->hasStride(I->getOperand(0)) || 5807 Legal->hasStride(I->getOperand(1)); 5808 } 5809 5810 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5811 unsigned VF) { 5812 assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); 5813 Type *ValTy = getMemInstValueType(I); 5814 auto SE = PSE.getSE(); 5815 5816 unsigned AS = getLoadStoreAddressSpace(I); 5817 Value *Ptr = getLoadStorePointerOperand(I); 5818 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5819 5820 // Figure out whether the access is strided and get the stride value 5821 // if it's known in compile time 5822 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5823 5824 // Get the cost of the scalar memory instruction and address computation. 5825 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5826 5827 // Don't pass *I here, since it is scalar but will actually be part of a 5828 // vectorized loop where the user of it is a vectorized instruction. 
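  // (Overall shape of the scalarization cost computed here: VF scalar address
  // computations plus VF scalar memory operations, plus the insert/extract
  // overhead added below; predicated accesses are additionally scaled down by
  // the block probability.)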
5829 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5830 Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 5831 Alignment, AS); 5832 5833 // Get the overhead of the extractelement and insertelement instructions 5834 // we might create due to scalarization. 5835 Cost += getScalarizationOverhead(I, VF); 5836 5837 // If we have a predicated store, it may not be executed for each vector 5838 // lane. Scale the cost by the probability of executing the predicated 5839 // block. 5840 if (isPredicatedInst(I)) { 5841 Cost /= getReciprocalPredBlockProb(); 5842 5843 if (useEmulatedMaskMemRefHack(I)) 5844 // Artificially setting to a high enough value to practically disable 5845 // vectorization with such operations. 5846 Cost = 3000000; 5847 } 5848 5849 return Cost; 5850 } 5851 5852 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5853 unsigned VF) { 5854 Type *ValTy = getMemInstValueType(I); 5855 Type *VectorTy = ToVectorTy(ValTy, VF); 5856 Value *Ptr = getLoadStorePointerOperand(I); 5857 unsigned AS = getLoadStoreAddressSpace(I); 5858 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 5859 5860 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5861 "Stride should be 1 or -1 for consecutive memory access"); 5862 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5863 unsigned Cost = 0; 5864 if (Legal->isMaskRequired(I)) 5865 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, 5866 Alignment ? Alignment->value() : 0, AS); 5867 else 5868 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); 5869 5870 bool Reverse = ConsecutiveStride < 0; 5871 if (Reverse) 5872 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5873 return Cost; 5874 } 5875 5876 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5877 unsigned VF) { 5878 Type *ValTy = getMemInstValueType(I); 5879 Type *VectorTy = ToVectorTy(ValTy, VF); 5880 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5881 unsigned AS = getLoadStoreAddressSpace(I); 5882 if (isa<LoadInst>(I)) { 5883 return TTI.getAddressComputationCost(ValTy) + 5884 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + 5885 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 5886 } 5887 StoreInst *SI = cast<StoreInst>(I); 5888 5889 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 5890 return TTI.getAddressComputationCost(ValTy) + 5891 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + 5892 (isLoopInvariantStoreValue 5893 ? 0 5894 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 5895 VF - 1)); 5896 } 5897 5898 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 5899 unsigned VF) { 5900 Type *ValTy = getMemInstValueType(I); 5901 Type *VectorTy = ToVectorTy(ValTy, VF); 5902 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5903 Value *Ptr = getLoadStorePointerOperand(I); 5904 5905 return TTI.getAddressComputationCost(VectorTy) + 5906 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, 5907 Legal->isMaskRequired(I), 5908 Alignment ? 
Alignment->value() : 0, I); 5909 } 5910 5911 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 5912 unsigned VF) { 5913 Type *ValTy = getMemInstValueType(I); 5914 Type *VectorTy = ToVectorTy(ValTy, VF); 5915 unsigned AS = getLoadStoreAddressSpace(I); 5916 5917 auto Group = getInterleavedAccessGroup(I); 5918 assert(Group && "Fail to get an interleaved access group."); 5919 5920 unsigned InterleaveFactor = Group->getFactor(); 5921 Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 5922 5923 // Holds the indices of existing members in an interleaved load group. 5924 // An interleaved store group doesn't need this as it doesn't allow gaps. 5925 SmallVector<unsigned, 4> Indices; 5926 if (isa<LoadInst>(I)) { 5927 for (unsigned i = 0; i < InterleaveFactor; i++) 5928 if (Group->getMember(i)) 5929 Indices.push_back(i); 5930 } 5931 5932 // Calculate the cost of the whole interleaved group. 5933 bool UseMaskForGaps = 5934 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5935 unsigned Cost = TTI.getInterleavedMemoryOpCost( 5936 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5937 Group->getAlign().value(), AS, Legal->isMaskRequired(I), UseMaskForGaps); 5938 5939 if (Group->isReverse()) { 5940 // TODO: Add support for reversed masked interleaved access. 5941 assert(!Legal->isMaskRequired(I) && 5942 "Reverse masked interleaved access not supported."); 5943 Cost += Group->getNumMembers() * 5944 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5945 } 5946 return Cost; 5947 } 5948 5949 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 5950 unsigned VF) { 5951 // Calculate scalar cost only. Vectorization cost should be ready at this 5952 // moment. 5953 if (VF == 1) { 5954 Type *ValTy = getMemInstValueType(I); 5955 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5956 unsigned AS = getLoadStoreAddressSpace(I); 5957 5958 return TTI.getAddressComputationCost(ValTy) + 5959 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); 5960 } 5961 return getWideningCost(I, VF); 5962 } 5963 5964 LoopVectorizationCostModel::VectorizationCostTy 5965 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 5966 // If we know that this instruction will remain uniform, check the cost of 5967 // the scalar version. 5968 if (isUniformAfterVectorization(I, VF)) 5969 VF = 1; 5970 5971 if (VF > 1 && isProfitableToScalarize(I, VF)) 5972 return VectorizationCostTy(InstsToScalarize[VF][I], false); 5973 5974 // Forced scalars do not have any scalarization overhead. 
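  // Their cost is modelled below as VF independent scalar copies, i.e.
  // VF * getInstructionCost(I, 1).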
5975 auto ForcedScalar = ForcedScalars.find(VF); 5976 if (VF > 1 && ForcedScalar != ForcedScalars.end()) { 5977 auto InstSet = ForcedScalar->second; 5978 if (InstSet.find(I) != InstSet.end()) 5979 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); 5980 } 5981 5982 Type *VectorTy; 5983 unsigned C = getInstructionCost(I, VF, VectorTy); 5984 5985 bool TypeNotScalarized = 5986 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; 5987 return VectorizationCostTy(C, TypeNotScalarized); 5988 } 5989 5990 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 5991 unsigned VF) { 5992 5993 if (VF == 1) 5994 return 0; 5995 5996 unsigned Cost = 0; 5997 Type *RetTy = ToVectorTy(I->getType(), VF); 5998 if (!RetTy->isVoidTy() && 5999 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6000 Cost += TTI.getScalarizationOverhead(RetTy, true, false); 6001 6002 // Some targets keep addresses scalar. 6003 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6004 return Cost; 6005 6006 // Some targets support efficient element stores. 6007 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6008 return Cost; 6009 6010 // Collect operands to consider. 6011 CallInst *CI = dyn_cast<CallInst>(I); 6012 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6013 6014 // Skip operands that do not require extraction/scalarization and do not incur 6015 // any overhead. 6016 return Cost + TTI.getOperandsScalarizationOverhead( 6017 filterExtractingOperands(Ops, VF), VF); 6018 } 6019 6020 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { 6021 if (VF == 1) 6022 return; 6023 NumPredStores = 0; 6024 for (BasicBlock *BB : TheLoop->blocks()) { 6025 // For each instruction in the old loop. 6026 for (Instruction &I : *BB) { 6027 Value *Ptr = getLoadStorePointerOperand(&I); 6028 if (!Ptr) 6029 continue; 6030 6031 // TODO: We should generate better code and update the cost model for 6032 // predicated uniform stores. Today they are treated as any other 6033 // predicated store (see added test cases in 6034 // invariant-store-vectorization.ll). 6035 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6036 NumPredStores++; 6037 6038 if (Legal->isUniform(Ptr) && 6039 // Conditional loads and stores should be scalarized and predicated. 6040 // isScalarWithPredication cannot be used here since masked 6041 // gather/scatters are not considered scalar with predication. 6042 !Legal->blockNeedsPredication(I.getParent())) { 6043 // TODO: Avoid replicating loads and stores instead of 6044 // relying on instcombine to remove them. 6045 // Load: Scalar load + broadcast 6046 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6047 unsigned Cost = getUniformMemOpCost(&I, VF); 6048 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6049 continue; 6050 } 6051 6052 // We assume that widening is the best solution when possible. 6053 if (memoryInstructionCanBeWidened(&I, VF)) { 6054 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6055 int ConsecutiveStride = 6056 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6057 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6058 "Expected consecutive stride."); 6059 InstWidening Decision = 6060 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6061 setWideningDecision(&I, VF, Decision, Cost); 6062 continue; 6063 } 6064 6065 // Choose between Interleaving, Gather/Scatter or Scalarization. 
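      // Illustrative example (assumed costs, not from any target): with
      // InterleaveCost = 12, GatherScatterCost = 20 and ScalarizationCost = 16
      // for the whole group, interleaving is selected below. Note the
      // comparisons: interleaving wins a tie with gather/scatter but must be
      // strictly cheaper than scalarization, and gather/scatter must likewise
      // be strictly cheaper than scalarization.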
6066 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6067 unsigned NumAccesses = 1; 6068 if (isAccessInterleaved(&I)) { 6069 auto Group = getInterleavedAccessGroup(&I); 6070 assert(Group && "Fail to get an interleaved access group."); 6071 6072 // Make one decision for the whole group. 6073 if (getWideningDecision(&I, VF) != CM_Unknown) 6074 continue; 6075 6076 NumAccesses = Group->getNumMembers(); 6077 if (interleavedAccessCanBeWidened(&I, VF)) 6078 InterleaveCost = getInterleaveGroupCost(&I, VF); 6079 } 6080 6081 unsigned GatherScatterCost = 6082 isLegalGatherOrScatter(&I) 6083 ? getGatherScatterCost(&I, VF) * NumAccesses 6084 : std::numeric_limits<unsigned>::max(); 6085 6086 unsigned ScalarizationCost = 6087 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6088 6089 // Choose better solution for the current VF, 6090 // write down this decision and use it during vectorization. 6091 unsigned Cost; 6092 InstWidening Decision; 6093 if (InterleaveCost <= GatherScatterCost && 6094 InterleaveCost < ScalarizationCost) { 6095 Decision = CM_Interleave; 6096 Cost = InterleaveCost; 6097 } else if (GatherScatterCost < ScalarizationCost) { 6098 Decision = CM_GatherScatter; 6099 Cost = GatherScatterCost; 6100 } else { 6101 Decision = CM_Scalarize; 6102 Cost = ScalarizationCost; 6103 } 6104 // If the instructions belongs to an interleave group, the whole group 6105 // receives the same decision. The whole group receives the cost, but 6106 // the cost will actually be assigned to one instruction. 6107 if (auto Group = getInterleavedAccessGroup(&I)) 6108 setWideningDecision(Group, VF, Decision, Cost); 6109 else 6110 setWideningDecision(&I, VF, Decision, Cost); 6111 } 6112 } 6113 6114 // Make sure that any load of address and any other address computation 6115 // remains scalar unless there is gather/scatter support. This avoids 6116 // inevitable extracts into address registers, and also has the benefit of 6117 // activating LSR more, since that pass can't optimize vectorized 6118 // addresses. 6119 if (TTI.prefersVectorizedAddressing()) 6120 return; 6121 6122 // Start with all scalar pointer uses. 6123 SmallPtrSet<Instruction *, 8> AddrDefs; 6124 for (BasicBlock *BB : TheLoop->blocks()) 6125 for (Instruction &I : *BB) { 6126 Instruction *PtrDef = 6127 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6128 if (PtrDef && TheLoop->contains(PtrDef) && 6129 getWideningDecision(&I, VF) != CM_GatherScatter) 6130 AddrDefs.insert(PtrDef); 6131 } 6132 6133 // Add all instructions used to generate the addresses. 6134 SmallVector<Instruction *, 4> Worklist; 6135 for (auto *I : AddrDefs) 6136 Worklist.push_back(I); 6137 while (!Worklist.empty()) { 6138 Instruction *I = Worklist.pop_back_val(); 6139 for (auto &Op : I->operands()) 6140 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6141 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6142 AddrDefs.insert(InstOp).second) 6143 Worklist.push_back(InstOp); 6144 } 6145 6146 for (auto *I : AddrDefs) { 6147 if (isa<LoadInst>(I)) { 6148 // Setting the desired widening decision should ideally be handled in 6149 // by cost functions, but since this involves the task of finding out 6150 // if the loaded register is involved in an address computation, it is 6151 // instead changed here when we know this is the case. 6152 InstWidening Decision = getWideningDecision(I, VF); 6153 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6154 // Scalarize a widened load of address. 
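        // (Its cost below is modelled as VF independent scalar loads.)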
6155 setWideningDecision(I, VF, CM_Scalarize, 6156 (VF * getMemoryInstructionCost(I, 1))); 6157 else if (auto Group = getInterleavedAccessGroup(I)) { 6158 // Scalarize an interleave group of address loads. 6159 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6160 if (Instruction *Member = Group->getMember(I)) 6161 setWideningDecision(Member, VF, CM_Scalarize, 6162 (VF * getMemoryInstructionCost(Member, 1))); 6163 } 6164 } 6165 } else 6166 // Make sure I gets scalarized and a cost estimate without 6167 // scalarization overhead. 6168 ForcedScalars[VF].insert(I); 6169 } 6170 } 6171 6172 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6173 unsigned VF, 6174 Type *&VectorTy) { 6175 Type *RetTy = I->getType(); 6176 if (canTruncateToMinimalBitwidth(I, VF)) 6177 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6178 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6179 auto SE = PSE.getSE(); 6180 6181 // TODO: We need to estimate the cost of intrinsic calls. 6182 switch (I->getOpcode()) { 6183 case Instruction::GetElementPtr: 6184 // We mark this instruction as zero-cost because the cost of GEPs in 6185 // vectorized code depends on whether the corresponding memory instruction 6186 // is scalarized or not. Therefore, we handle GEPs with the memory 6187 // instruction cost. 6188 return 0; 6189 case Instruction::Br: { 6190 // In cases of scalarized and predicated instructions, there will be VF 6191 // predicated blocks in the vectorized loop. Each branch around these 6192 // blocks requires also an extract of its vector compare i1 element. 6193 bool ScalarPredicatedBB = false; 6194 BranchInst *BI = cast<BranchInst>(I); 6195 if (VF > 1 && BI->isConditional() && 6196 (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) != 6197 PredicatedBBsAfterVectorization.end() || 6198 PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) != 6199 PredicatedBBsAfterVectorization.end())) 6200 ScalarPredicatedBB = true; 6201 6202 if (ScalarPredicatedBB) { 6203 // Return cost for branches around scalarized and predicated blocks. 6204 Type *Vec_i1Ty = 6205 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6206 return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) + 6207 (TTI.getCFInstrCost(Instruction::Br) * VF)); 6208 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) 6209 // The back-edge branch will remain, as will all scalar branches. 6210 return TTI.getCFInstrCost(Instruction::Br); 6211 else 6212 // This branch will be eliminated by if-conversion. 6213 return 0; 6214 // Note: We currently assume zero cost for an unconditional branch inside 6215 // a predicated block since it will become a fall-through, although we 6216 // may decide in the future to call TTI for all branches. 6217 } 6218 case Instruction::PHI: { 6219 auto *Phi = cast<PHINode>(I); 6220 6221 // First-order recurrences are replaced by vector shuffles inside the loop. 6222 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6223 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) 6224 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 6225 VectorTy, VF - 1, VectorType::get(RetTy, 1)); 6226 6227 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6228 // converted into select instructions. We require N - 1 selects per phi 6229 // node, where N is the number of incoming values. 
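    // For example (illustrative): a phi with three incoming values in an
    // if-converted block becomes two vector selects, so its cost at this VF
    // is twice the cost of a single vector select.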
6230 if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) 6231 return (Phi->getNumIncomingValues() - 1) * 6232 TTI.getCmpSelInstrCost( 6233 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6234 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)); 6235 6236 return TTI.getCFInstrCost(Instruction::PHI); 6237 } 6238 case Instruction::UDiv: 6239 case Instruction::SDiv: 6240 case Instruction::URem: 6241 case Instruction::SRem: 6242 // If we have a predicated instruction, it may not be executed for each 6243 // vector lane. Get the scalarization cost and scale this amount by the 6244 // probability of executing the predicated block. If the instruction is not 6245 // predicated, we fall through to the next case. 6246 if (VF > 1 && isScalarWithPredication(I)) { 6247 unsigned Cost = 0; 6248 6249 // These instructions have a non-void type, so account for the phi nodes 6250 // that we will create. This cost is likely to be zero. The phi node 6251 // cost, if any, should be scaled by the block probability because it 6252 // models a copy at the end of each predicated block. 6253 Cost += VF * TTI.getCFInstrCost(Instruction::PHI); 6254 6255 // The cost of the non-predicated instruction. 6256 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy); 6257 6258 // The cost of insertelement and extractelement instructions needed for 6259 // scalarization. 6260 Cost += getScalarizationOverhead(I, VF); 6261 6262 // Scale the cost by the probability of executing the predicated blocks. 6263 // This assumes the predicated block for each vector lane is equally 6264 // likely. 6265 return Cost / getReciprocalPredBlockProb(); 6266 } 6267 LLVM_FALLTHROUGH; 6268 case Instruction::Add: 6269 case Instruction::FAdd: 6270 case Instruction::Sub: 6271 case Instruction::FSub: 6272 case Instruction::Mul: 6273 case Instruction::FMul: 6274 case Instruction::FDiv: 6275 case Instruction::FRem: 6276 case Instruction::Shl: 6277 case Instruction::LShr: 6278 case Instruction::AShr: 6279 case Instruction::And: 6280 case Instruction::Or: 6281 case Instruction::Xor: { 6282 // Since we will replace the stride by 1 the multiplication should go away. 6283 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6284 return 0; 6285 // Certain instructions can be cheaper to vectorize if they have a constant 6286 // second vector operand. One example of this are shifts on x86. 6287 Value *Op2 = I->getOperand(1); 6288 TargetTransformInfo::OperandValueProperties Op2VP; 6289 TargetTransformInfo::OperandValueKind Op2VK = 6290 TTI.getOperandInfo(Op2, Op2VP); 6291 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6292 Op2VK = TargetTransformInfo::OK_UniformValue; 6293 6294 SmallVector<const Value *, 4> Operands(I->operand_values()); 6295 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6296 return N * TTI.getArithmeticInstrCost( 6297 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6298 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6299 } 6300 case Instruction::FNeg: { 6301 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6302 return N * TTI.getArithmeticInstrCost( 6303 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6304 TargetTransformInfo::OK_AnyValue, 6305 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6306 I->getOperand(0), I); 6307 } 6308 case Instruction::Select: { 6309 SelectInst *SI = cast<SelectInst>(I); 6310 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6311 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6312 Type *CondTy = SI->getCondition()->getType(); 6313 if (!ScalarCond) 6314 CondTy = VectorType::get(CondTy, VF); 6315 6316 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I); 6317 } 6318 case Instruction::ICmp: 6319 case Instruction::FCmp: { 6320 Type *ValTy = I->getOperand(0)->getType(); 6321 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6322 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6323 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6324 VectorTy = ToVectorTy(ValTy, VF); 6325 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I); 6326 } 6327 case Instruction::Store: 6328 case Instruction::Load: { 6329 unsigned Width = VF; 6330 if (Width > 1) { 6331 InstWidening Decision = getWideningDecision(I, Width); 6332 assert(Decision != CM_Unknown && 6333 "CM decision should be taken at this point"); 6334 if (Decision == CM_Scalarize) 6335 Width = 1; 6336 } 6337 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6338 return getMemoryInstructionCost(I, VF); 6339 } 6340 case Instruction::ZExt: 6341 case Instruction::SExt: 6342 case Instruction::FPToUI: 6343 case Instruction::FPToSI: 6344 case Instruction::FPExt: 6345 case Instruction::PtrToInt: 6346 case Instruction::IntToPtr: 6347 case Instruction::SIToFP: 6348 case Instruction::UIToFP: 6349 case Instruction::Trunc: 6350 case Instruction::FPTrunc: 6351 case Instruction::BitCast: { 6352 // We optimize the truncation of induction variables having constant 6353 // integer steps. The cost of these truncations is the same as the scalar 6354 // operation. 6355 if (isOptimizableIVTruncate(I, VF)) { 6356 auto *Trunc = cast<TruncInst>(I); 6357 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6358 Trunc->getSrcTy(), Trunc); 6359 } 6360 6361 Type *SrcScalarTy = I->getOperand(0)->getType(); 6362 Type *SrcVecTy = 6363 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6364 if (canTruncateToMinimalBitwidth(I, VF)) { 6365 // This cast is going to be shrunk. This may remove the cast or it might 6366 // turn it into slightly different cast. For example, if MinBW == 16, 6367 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6368 // 6369 // Calculate the modified src and dest types. 6370 Type *MinVecTy = VectorTy; 6371 if (I->getOpcode() == Instruction::Trunc) { 6372 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6373 VectorTy = 6374 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6375 } else if (I->getOpcode() == Instruction::ZExt || 6376 I->getOpcode() == Instruction::SExt) { 6377 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6378 VectorTy = 6379 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6380 } 6381 } 6382 6383 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6384 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); 6385 } 6386 case Instruction::Call: { 6387 bool NeedToScalarize; 6388 CallInst *CI = cast<CallInst>(I); 6389 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6390 if (getVectorIntrinsicIDForCall(CI, TLI)) 6391 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6392 return CallCost; 6393 } 6394 default: 6395 // The cost of executing VF copies of the scalar instruction. This opcode 6396 // is unknown. Assume that it is the same as 'mul'. 6397 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + 6398 getScalarizationOverhead(I, VF); 6399 } // end of switch. 6400 } 6401 6402 char LoopVectorize::ID = 0; 6403 6404 static const char lv_name[] = "Loop Vectorization"; 6405 6406 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6407 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6408 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6409 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6410 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6411 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6412 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6413 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6414 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6415 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6416 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6417 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6418 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6419 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6420 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6421 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6422 6423 namespace llvm { 6424 6425 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6426 6427 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6428 bool VectorizeOnlyWhenForced) { 6429 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6430 } 6431 6432 } // end namespace llvm 6433 6434 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6435 // Check if the pointer operand of a load or store instruction is 6436 // consecutive. 6437 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6438 return Legal->isConsecutivePtr(Ptr); 6439 return false; 6440 } 6441 6442 void LoopVectorizationCostModel::collectValuesToIgnore() { 6443 // Ignore ephemeral values. 6444 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6445 6446 // Ignore type-promoting instructions we identified during reduction 6447 // detection. 6448 for (auto &Reduction : Legal->getReductionVars()) { 6449 RecurrenceDescriptor &RedDes = Reduction.second; 6450 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6451 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6452 } 6453 // Ignore type-casting instructions we identified during induction 6454 // detection. 6455 for (auto &Induction : Legal->getInductionVars()) { 6456 InductionDescriptor &IndDes = Induction.second; 6457 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6458 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6459 } 6460 } 6461 6462 // TODO: we could return a pair of values that specify the max VF and 6463 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6464 // `buildVPlans(VF, VF)`. 
We cannot do it because VPLAN at the moment 6465 // doesn't have a cost model that can choose which plan to execute if 6466 // more than one is generated. 6467 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 6468 LoopVectorizationCostModel &CM) { 6469 unsigned WidestType; 6470 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 6471 return WidestVectorRegBits / WidestType; 6472 } 6473 6474 VectorizationFactor 6475 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { 6476 unsigned VF = UserVF; 6477 // Outer loop handling: They may require CFG and instruction level 6478 // transformations before even evaluating whether vectorization is profitable. 6479 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 6480 // the vectorization pipeline. 6481 if (!OrigLoop->empty()) { 6482 // If the user doesn't provide a vectorization factor, determine a 6483 // reasonable one. 6484 if (!UserVF) { 6485 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM); 6486 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 6487 6488 // Make sure we have a VF > 1 for stress testing. 6489 if (VPlanBuildStressTest && VF < 2) { 6490 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 6491 << "overriding computed VF.\n"); 6492 VF = 4; 6493 } 6494 } 6495 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 6496 assert(isPowerOf2_32(VF) && "VF needs to be a power of two"); 6497 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF 6498 << " to build VPlans.\n"); 6499 buildVPlans(VF, VF); 6500 6501 // For VPlan build stress testing, we bail out after VPlan construction. 6502 if (VPlanBuildStressTest) 6503 return VectorizationFactor::Disabled(); 6504 6505 return {VF, 0}; 6506 } 6507 6508 LLVM_DEBUG( 6509 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 6510 "VPlan-native path.\n"); 6511 return VectorizationFactor::Disabled(); 6512 } 6513 6514 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) { 6515 assert(OrigLoop->empty() && "Inner loop expected."); 6516 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(); 6517 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 6518 return None; 6519 6520 // Invalidate interleave groups if all blocks of loop will be predicated. 6521 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 6522 !useMaskedInterleavedAccesses(*TTI)) { 6523 LLVM_DEBUG( 6524 dbgs() 6525 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 6526 "which requires masked-interleaved support.\n"); 6527 CM.InterleaveInfo.reset(); 6528 } 6529 6530 if (UserVF) { 6531 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 6532 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); 6533 // Collect the instructions (and their associated costs) that will be more 6534 // profitable to scalarize. 6535 CM.selectUserVectorizationFactor(UserVF); 6536 buildVPlansWithVPRecipes(UserVF, UserVF); 6537 LLVM_DEBUG(printPlans(dbgs())); 6538 return {{UserVF, 0}}; 6539 } 6540 6541 unsigned MaxVF = MaybeMaxVF.getValue(); 6542 assert(MaxVF != 0 && "MaxVF is zero."); 6543 6544 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 6545 // Collect Uniform and Scalar instructions after vectorization with VF. 6546 CM.collectUniformsAndScalars(VF); 6547 6548 // Collect the instructions (and their associated costs) that will be more 6549 // profitable to scalarize. 
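    // (Illustrative: with MaxVF = 8 this loop analyzes VF = 1, 2, 4 and 8;
    // buildVPlansWithVPRecipes below then covers the same power-of-two range.)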
6550 if (VF > 1) 6551 CM.collectInstsToScalarize(VF); 6552 } 6553 6554 buildVPlansWithVPRecipes(1, MaxVF); 6555 LLVM_DEBUG(printPlans(dbgs())); 6556 if (MaxVF == 1) 6557 return VectorizationFactor::Disabled(); 6558 6559 // Select the optimal vectorization factor. 6560 return CM.selectVectorizationFactor(MaxVF); 6561 } 6562 6563 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { 6564 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 6565 << '\n'); 6566 BestVF = VF; 6567 BestUF = UF; 6568 6569 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 6570 return !Plan->hasVF(VF); 6571 }); 6572 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 6573 } 6574 6575 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 6576 DominatorTree *DT) { 6577 // Perform the actual loop transformation. 6578 6579 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 6580 VPCallbackILV CallbackILV(ILV); 6581 6582 VPTransformState State{BestVF, BestUF, LI, 6583 DT, ILV.Builder, ILV.VectorLoopValueMap, 6584 &ILV, CallbackILV}; 6585 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 6586 State.TripCount = ILV.getOrCreateTripCount(nullptr); 6587 6588 //===------------------------------------------------===// 6589 // 6590 // Notice: any optimization or new instruction that go 6591 // into the code below should also be implemented in 6592 // the cost-model. 6593 // 6594 //===------------------------------------------------===// 6595 6596 // 2. Copy and widen instructions from the old loop into the new loop. 6597 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 6598 VPlans.front()->execute(&State); 6599 6600 // 3. Fix the vectorized code: take care of header phi's, live-outs, 6601 // predication, updating analyses. 6602 ILV.fixVectorizedLoop(); 6603 } 6604 6605 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 6606 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6607 BasicBlock *Latch = OrigLoop->getLoopLatch(); 6608 6609 // We create new control-flow for the vectorized loop, so the original 6610 // condition will be dead after vectorization if it's only used by the 6611 // branch. 6612 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 6613 if (Cmp && Cmp->hasOneUse()) 6614 DeadInstructions.insert(Cmp); 6615 6616 // We create new "steps" for induction variable updates to which the original 6617 // induction variables map. An original update instruction will be dead if 6618 // all its users except the induction variable are dead. 6619 for (auto &Induction : Legal->getInductionVars()) { 6620 PHINode *Ind = Induction.first; 6621 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 6622 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 6623 return U == Ind || DeadInstructions.find(cast<Instruction>(U)) != 6624 DeadInstructions.end(); 6625 })) 6626 DeadInstructions.insert(IndUpdate); 6627 6628 // We record as "Dead" also the type-casting instructions we had identified 6629 // during induction analysis. We don't need any handling for them in the 6630 // vectorized loop because we have proven that, under a proper runtime 6631 // test guarding the vectorized loop, the value of the phi, and the casted 6632 // value of the phi, are the same. The last instruction in this casting chain 6633 // will get its scalar/vector/widened def from the scalar/vector/widened def 6634 // of the respective phi node. 
Any other casts in the induction def-use chain 6635 // have no other uses outside the phi update chain, and will be ignored. 6636 InductionDescriptor &IndDes = Induction.second; 6637 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6638 DeadInstructions.insert(Casts.begin(), Casts.end()); 6639 } 6640 } 6641 6642 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6643 6644 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6645 6646 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6647 Instruction::BinaryOps BinOp) { 6648 // When unrolling and the VF is 1, we only need to add a simple scalar. 6649 Type *Ty = Val->getType(); 6650 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6651 6652 if (Ty->isFloatingPointTy()) { 6653 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6654 6655 // Floating point operations had to be 'fast' to enable the unrolling. 6656 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6657 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6658 } 6659 Constant *C = ConstantInt::get(Ty, StartIdx); 6660 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6661 } 6662 6663 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6664 SmallVector<Metadata *, 4> MDs; 6665 // Reserve first location for self reference to the LoopID metadata node. 6666 MDs.push_back(nullptr); 6667 bool IsUnrollMetadata = false; 6668 MDNode *LoopID = L->getLoopID(); 6669 if (LoopID) { 6670 // First find existing loop unrolling disable metadata. 6671 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6672 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6673 if (MD) { 6674 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6675 IsUnrollMetadata = 6676 S && S->getString().startswith("llvm.loop.unroll.disable"); 6677 } 6678 MDs.push_back(LoopID->getOperand(i)); 6679 } 6680 } 6681 6682 if (!IsUnrollMetadata) { 6683 // Add runtime unroll disable metadata. 6684 LLVMContext &Context = L->getHeader()->getContext(); 6685 SmallVector<Metadata *, 1> DisableOperands; 6686 DisableOperands.push_back( 6687 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6688 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6689 MDs.push_back(DisableNode); 6690 MDNode *NewLoopID = MDNode::get(Context, MDs); 6691 // Set operand 0 to refer to the loop id itself. 6692 NewLoopID->replaceOperandWith(0, NewLoopID); 6693 L->setLoopID(NewLoopID); 6694 } 6695 } 6696 6697 bool LoopVectorizationPlanner::getDecisionAndClampRange( 6698 const std::function<bool(unsigned)> &Predicate, VFRange &Range) { 6699 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 6700 bool PredicateAtRangeStart = Predicate(Range.Start); 6701 6702 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 6703 if (Predicate(TmpVF) != PredicateAtRangeStart) { 6704 Range.End = TmpVF; 6705 break; 6706 } 6707 6708 return PredicateAtRangeStart; 6709 } 6710 6711 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 6712 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 6713 /// of VF's starting at a given VF and extending it as much as possible. Each 6714 /// vectorization decision can potentially shorten this sub-range during 6715 /// buildVPlan(). 
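/// For example (illustrative): with MinVF = 1 and MaxVF = 8, the first call to
/// buildVPlan may return a plan covering only {1, 2} if some widening decision
/// changes at VF = 4; a second plan is then built for the remaining {4, 8}.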
6716 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 6717 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6718 VFRange SubRange = {VF, MaxVF + 1}; 6719 VPlans.push_back(buildVPlan(SubRange)); 6720 VF = SubRange.End; 6721 } 6722 } 6723 6724 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 6725 VPlanPtr &Plan) { 6726 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 6727 6728 // Look for cached value. 6729 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 6730 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 6731 if (ECEntryIt != EdgeMaskCache.end()) 6732 return ECEntryIt->second; 6733 6734 VPValue *SrcMask = createBlockInMask(Src, Plan); 6735 6736 // The terminator has to be a branch inst! 6737 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 6738 assert(BI && "Unexpected terminator found"); 6739 6740 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 6741 return EdgeMaskCache[Edge] = SrcMask; 6742 6743 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 6744 assert(EdgeMask && "No Edge Mask found for condition"); 6745 6746 if (BI->getSuccessor(0) != Dst) 6747 EdgeMask = Builder.createNot(EdgeMask); 6748 6749 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 6750 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 6751 6752 return EdgeMaskCache[Edge] = EdgeMask; 6753 } 6754 6755 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 6756 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 6757 6758 // Look for cached value. 6759 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 6760 if (BCEntryIt != BlockMaskCache.end()) 6761 return BCEntryIt->second; 6762 6763 // All-one mask is modelled as no-mask following the convention for masked 6764 // load/store/gather/scatter. Initialize BlockMask to no-mask. 6765 VPValue *BlockMask = nullptr; 6766 6767 if (OrigLoop->getHeader() == BB) { 6768 if (!CM.blockNeedsPredication(BB)) 6769 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 6770 6771 // Introduce the early-exit compare IV <= BTC to form header block mask. 6772 // This is used instead of IV < TC because TC may wrap, unlike BTC. 6773 VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction()); 6774 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 6775 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 6776 return BlockMaskCache[BB] = BlockMask; 6777 } 6778 6779 // This is the block mask. We OR all incoming edges. 6780 for (auto *Predecessor : predecessors(BB)) { 6781 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 6782 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 6783 return BlockMaskCache[BB] = EdgeMask; 6784 6785 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
6786 BlockMask = EdgeMask; 6787 continue; 6788 } 6789 6790 BlockMask = Builder.createOr(BlockMask, EdgeMask); 6791 } 6792 6793 return BlockMaskCache[BB] = BlockMask; 6794 } 6795 6796 VPWidenMemoryInstructionRecipe * 6797 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 6798 VPlanPtr &Plan) { 6799 if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) 6800 return nullptr; 6801 6802 auto willWiden = [&](unsigned VF) -> bool { 6803 if (VF == 1) 6804 return false; 6805 LoopVectorizationCostModel::InstWidening Decision = 6806 CM.getWideningDecision(I, VF); 6807 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 6808 "CM decision should be taken at this point."); 6809 if (Decision == LoopVectorizationCostModel::CM_Interleave) 6810 return true; 6811 if (CM.isScalarAfterVectorization(I, VF) || 6812 CM.isProfitableToScalarize(I, VF)) 6813 return false; 6814 return Decision != LoopVectorizationCostModel::CM_Scalarize; 6815 }; 6816 6817 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6818 return nullptr; 6819 6820 VPValue *Mask = nullptr; 6821 if (Legal->isMaskRequired(I)) 6822 Mask = createBlockInMask(I->getParent(), Plan); 6823 6824 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 6825 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 6826 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 6827 6828 StoreInst *Store = cast<StoreInst>(I); 6829 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 6830 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 6831 } 6832 6833 VPWidenIntOrFpInductionRecipe * 6834 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) { 6835 if (PHINode *Phi = dyn_cast<PHINode>(I)) { 6836 // Check if this is an integer or fp induction. If so, build the recipe that 6837 // produces its scalar and vector values. 6838 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 6839 if (II.getKind() == InductionDescriptor::IK_IntInduction || 6840 II.getKind() == InductionDescriptor::IK_FpInduction) 6841 return new VPWidenIntOrFpInductionRecipe(Phi); 6842 6843 return nullptr; 6844 } 6845 6846 // Optimize the special case where the source is a constant integer 6847 // induction variable. Notice that we can only optimize the 'trunc' case 6848 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 6849 // (c) other casts depend on pointer size. 6850 6851 // Determine whether \p K is a truncation based on an induction variable that 6852 // can be optimized. 6853 auto isOptimizableIVTruncate = 6854 [&](Instruction *K) -> std::function<bool(unsigned)> { 6855 return 6856 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; 6857 }; 6858 6859 if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange( 6860 isOptimizableIVTruncate(I), Range)) 6861 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 6862 cast<TruncInst>(I)); 6863 return nullptr; 6864 } 6865 6866 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) { 6867 PHINode *Phi = dyn_cast<PHINode>(I); 6868 if (!Phi || Phi->getParent() == OrigLoop->getHeader()) 6869 return nullptr; 6870 6871 // We know that all PHIs in non-header blocks are converted into selects, so 6872 // we don't have to worry about the insertion order and we can just use the 6873 // builder. At this point we generate the predication tree. 
  // There may be duplications since this is a simple recursive scan, but
  // future optimizations will clean it up.

  SmallVector<VPValue *, 2> Masks;
  unsigned NumIncoming = Phi->getNumIncomingValues();
  for (unsigned In = 0; In < NumIncoming; In++) {
    VPValue *EdgeMask =
        createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
    assert((EdgeMask || NumIncoming == 1) &&
           "Multiple predecessors with one having a full mask");
    if (EdgeMask)
      Masks.push_back(EdgeMask);
  }
  return new VPBlendRecipe(Phi, Masks);
}

VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(Instruction *I,
                                                   VFRange &Range) {
  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);

  CallInst *CI = dyn_cast<CallInst>(I);
  if (IsPredicated || !CI)
    return nullptr;

  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
    return nullptr;

  auto willWiden = [&](unsigned VF) -> bool {
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The call may have to be scalarized depending on the VF.
    // NeedToScalarize is set by getVectorCallCost when the call has to be
    // scalarized, i.e. when no vectorized (library) version is available or
    // it is too expensive. Prefer the vector intrinsic for the vectorized
    // version of the instruction when it is no more expensive than the call.
    bool NeedToScalarize = false;
    unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
    bool UseVectorIntrinsic =
        ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
    return UseVectorIntrinsic || !NeedToScalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  // Success: widen this call.
6922 return new VPWidenCallRecipe(*CI); 6923 } 6924 6925 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VFRange &Range) { 6926 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6927 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6928 6929 if (IsPredicated) 6930 return nullptr; 6931 6932 auto IsVectorizableOpcode = [](unsigned Opcode) { 6933 switch (Opcode) { 6934 case Instruction::Add: 6935 case Instruction::And: 6936 case Instruction::AShr: 6937 case Instruction::BitCast: 6938 case Instruction::Br: 6939 case Instruction::FAdd: 6940 case Instruction::FCmp: 6941 case Instruction::FDiv: 6942 case Instruction::FMul: 6943 case Instruction::FNeg: 6944 case Instruction::FPExt: 6945 case Instruction::FPToSI: 6946 case Instruction::FPToUI: 6947 case Instruction::FPTrunc: 6948 case Instruction::FRem: 6949 case Instruction::FSub: 6950 case Instruction::ICmp: 6951 case Instruction::IntToPtr: 6952 case Instruction::Load: 6953 case Instruction::LShr: 6954 case Instruction::Mul: 6955 case Instruction::Or: 6956 case Instruction::PHI: 6957 case Instruction::PtrToInt: 6958 case Instruction::SDiv: 6959 case Instruction::Select: 6960 case Instruction::SExt: 6961 case Instruction::Shl: 6962 case Instruction::SIToFP: 6963 case Instruction::SRem: 6964 case Instruction::Store: 6965 case Instruction::Sub: 6966 case Instruction::Trunc: 6967 case Instruction::UDiv: 6968 case Instruction::UIToFP: 6969 case Instruction::URem: 6970 case Instruction::Xor: 6971 case Instruction::ZExt: 6972 return true; 6973 } 6974 return false; 6975 }; 6976 6977 if (!IsVectorizableOpcode(I->getOpcode())) 6978 return nullptr; 6979 6980 auto willWiden = [&](unsigned VF) -> bool { 6981 if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) || 6982 CM.isProfitableToScalarize(I, VF))) 6983 return false; 6984 if (isa<LoadInst>(I) || isa<StoreInst>(I)) { 6985 assert(CM.getWideningDecision(I, VF) == 6986 LoopVectorizationCostModel::CM_Scalarize && 6987 "Memory widening decisions should have been taken care by now"); 6988 return false; 6989 } 6990 return true; 6991 }; 6992 6993 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6994 return nullptr; 6995 6996 // Success: widen this instruction. 6997 return new VPWidenRecipe(*I); 6998 } 6999 7000 VPBasicBlock *VPRecipeBuilder::handleReplication( 7001 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 7002 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 7003 VPlanPtr &Plan) { 7004 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 7005 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, 7006 Range); 7007 7008 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7009 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 7010 7011 auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); 7012 setRecipe(I, Recipe); 7013 7014 // Find if I uses a predicated instruction. If so, it will use its scalar 7015 // value. Avoid hoisting the insert-element which packs the scalar value into 7016 // a vector value, as that happens iff all users use the vector value. 7017 for (auto &Op : I->operands()) 7018 if (auto *PredInst = dyn_cast<Instruction>(Op)) 7019 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 7020 PredInst2Recipe[PredInst]->setAlsoPack(false); 7021 7022 // Finalize the recipe for Instr, first if it is not predicated. 
7023 if (!IsPredicated) { 7024 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 7025 VPBB->appendRecipe(Recipe); 7026 return VPBB; 7027 } 7028 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7029 assert(VPBB->getSuccessors().empty() && 7030 "VPBB has successors when handling predicated replication."); 7031 // Record predicated instructions for above packing optimizations. 7032 PredInst2Recipe[I] = Recipe; 7033 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7034 VPBlockUtils::insertBlockAfter(Region, VPBB); 7035 auto *RegSucc = new VPBasicBlock(); 7036 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7037 return RegSucc; 7038 } 7039 7040 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7041 VPRecipeBase *PredRecipe, 7042 VPlanPtr &Plan) { 7043 // Instructions marked for predication are replicated and placed under an 7044 // if-then construct to prevent side-effects. 7045 7046 // Generate recipes to compute the block mask for this region. 7047 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 7048 7049 // Build the triangular if-then region. 7050 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 7051 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 7052 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 7053 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 7054 auto *PHIRecipe = 7055 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 7056 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 7057 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 7058 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 7059 7060 // Note: first set Entry as region entry and then connect successors starting 7061 // from it in order, to propagate the "parent" of each VPBasicBlock. 7062 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 7063 VPBlockUtils::connectBlocks(Pred, Exit); 7064 7065 return Region; 7066 } 7067 7068 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, 7069 VPlanPtr &Plan, VPBasicBlock *VPBB) { 7070 VPRecipeBase *Recipe = nullptr; 7071 7072 // First, check for specific widening recipes that deal with calls, memory 7073 // operations, inductions and Phi nodes. 7074 if ((Recipe = tryToWidenCall(Instr, Range)) || 7075 (Recipe = tryToWidenMemory(Instr, Range, Plan)) || 7076 (Recipe = tryToOptimizeInduction(Instr, Range)) || 7077 (Recipe = tryToBlend(Instr, Plan)) || 7078 (isa<PHINode>(Instr) && 7079 (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) { 7080 setRecipe(Instr, Recipe); 7081 VPBB->appendRecipe(Recipe); 7082 return true; 7083 } 7084 7085 // Handle GEP widening. 7086 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) { 7087 auto Scalarize = [&](unsigned VF) { 7088 return CM.isScalarWithPredication(Instr, VF) || 7089 CM.isScalarAfterVectorization(Instr, VF) || 7090 CM.isProfitableToScalarize(Instr, VF); 7091 }; 7092 if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range)) 7093 return false; 7094 VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop); 7095 setRecipe(Instr, Recipe); 7096 VPBB->appendRecipe(Recipe); 7097 return true; 7098 } 7099 7100 // Check if Instr is to be widened by a general VPWidenRecipe, after 7101 // having first checked for specific widening recipes. 
7102 if ((Recipe = tryToWiden(Instr, Range))) { 7103 setRecipe(Instr, Recipe); 7104 VPBB->appendRecipe(Recipe); 7105 return true; 7106 } 7107 7108 return false; 7109 } 7110 7111 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 7112 unsigned MaxVF) { 7113 assert(OrigLoop->empty() && "Inner loop expected."); 7114 7115 // Collect conditions feeding internal conditional branches; they need to be 7116 // represented in VPlan for it to model masking. 7117 SmallPtrSet<Value *, 1> NeedDef; 7118 7119 auto *Latch = OrigLoop->getLoopLatch(); 7120 for (BasicBlock *BB : OrigLoop->blocks()) { 7121 if (BB == Latch) 7122 continue; 7123 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); 7124 if (Branch && Branch->isConditional()) 7125 NeedDef.insert(Branch->getCondition()); 7126 } 7127 7128 // If the tail is to be folded by masking, the primary induction variable 7129 // needs to be represented in VPlan for it to model early-exit masking. 7130 // Also, both the Phi and the live-out instruction of each reduction are 7131 // required in order to introduce a select between them in VPlan. 7132 if (CM.foldTailByMasking()) { 7133 NeedDef.insert(Legal->getPrimaryInduction()); 7134 for (auto &Reduction : Legal->getReductionVars()) { 7135 NeedDef.insert(Reduction.first); 7136 NeedDef.insert(Reduction.second.getLoopExitInstr()); 7137 } 7138 } 7139 7140 // Collect instructions from the original loop that will become trivially dead 7141 // in the vectorized loop. We don't need to vectorize these instructions. For 7142 // example, original induction update instructions can become dead because we 7143 // separately emit induction "steps" when generating code for the new loop. 7144 // Similarly, we create a new latch condition when setting up the structure 7145 // of the new loop, so the old one can become dead. 7146 SmallPtrSet<Instruction *, 4> DeadInstructions; 7147 collectTriviallyDeadInstructions(DeadInstructions); 7148 7149 // Add assume instructions we need to drop to DeadInstructions, to prevent 7150 // them from being added to the VPlan. 7151 // TODO: We only need to drop assumes in blocks that get flattened. If the 7152 // control flow is preserved, we should keep them. 7153 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 7154 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 7155 7156 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 7157 // Dead instructions do not need sinking. Remove them from SinkAfter. 7158 for (Instruction *I : DeadInstructions) 7159 SinkAfter.erase(I); 7160 7161 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 7162 VFRange SubRange = {VF, MaxVF + 1}; 7163 VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef, 7164 DeadInstructions, SinkAfter)); 7165 VF = SubRange.End; 7166 } 7167 } 7168 7169 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 7170 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, 7171 SmallPtrSetImpl<Instruction *> &DeadInstructions, 7172 const DenseMap<Instruction *, Instruction *> &SinkAfter) { 7173 7174 // Hold a mapping from predicated instructions to their recipes, in order to 7175 // fix their AlsoPack behavior if a user is determined to replicate and use a 7176 // scalar instead of vector value.
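// handleReplication populates this map for predicated instructions and clears
// AlsoPack on a recorded recipe when a later replicated instruction is found
// to use its scalar value.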
7177 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; 7178 7179 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 7180 7181 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); 7182 7183 // --------------------------------------------------------------------------- 7184 // Pre-construction: record ingredients whose recipes we'll need to further 7185 // process after constructing the initial VPlan. 7186 // --------------------------------------------------------------------------- 7187 7188 // Mark instructions we'll need to sink later and their targets as 7189 // ingredients whose recipe we'll need to record. 7190 for (auto &Entry : SinkAfter) { 7191 RecipeBuilder.recordRecipeOf(Entry.first); 7192 RecipeBuilder.recordRecipeOf(Entry.second); 7193 } 7194 7195 // For each interleave group which is relevant for this (possibly trimmed) 7196 // Range, add it to the set of groups to be later applied to the VPlan and add 7197 // placeholders for its members' Recipes which we'll be replacing with a 7198 // single VPInterleaveRecipe. 7199 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 7200 auto applyIG = [IG, this](unsigned VF) -> bool { 7201 return (VF >= 2 && // Query is illegal for VF == 1 7202 CM.getWideningDecision(IG->getInsertPos(), VF) == 7203 LoopVectorizationCostModel::CM_Interleave); 7204 }; 7205 if (!getDecisionAndClampRange(applyIG, Range)) 7206 continue; 7207 InterleaveGroups.insert(IG); 7208 for (unsigned i = 0; i < IG->getFactor(); i++) 7209 if (Instruction *Member = IG->getMember(i)) 7210 RecipeBuilder.recordRecipeOf(Member); 7211 }; 7212 7213 // --------------------------------------------------------------------------- 7214 // Build initial VPlan: Scan the body of the loop in a topological order to 7215 // visit each basic block after having visited its predecessor basic blocks. 7216 // --------------------------------------------------------------------------- 7217 7218 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 7219 auto Plan = std::make_unique<VPlan>(); 7220 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 7221 Plan->setEntry(VPBB); 7222 7223 // Represent values that will have defs inside VPlan. 7224 for (Value *V : NeedDef) 7225 Plan->addVPValue(V); 7226 7227 // Scan the body of the loop in a topological order to visit each basic block 7228 // after having visited its predecessor basic blocks. 7229 LoopBlocksDFS DFS(OrigLoop); 7230 DFS.perform(LI); 7231 7232 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 7233 // Relevant instructions from basic block BB will be grouped into VPRecipe 7234 // ingredients and fill a new VPBasicBlock. 7235 unsigned VPBBsForBB = 0; 7236 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 7237 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 7238 VPBB = FirstVPBBForBB; 7239 Builder.setInsertPoint(VPBB); 7240 7241 // Introduce each ingredient into VPlan. 7242 for (Instruction &I : BB->instructionsWithoutDebug()) { 7243 Instruction *Instr = &I; 7244 7245 // First filter out irrelevant instructions, to ensure no recipes are 7246 // built for them. 7247 if (isa<BranchInst>(Instr) || 7248 DeadInstructions.find(Instr) != DeadInstructions.end()) 7249 continue; 7250 7251 if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB)) 7252 continue; 7253 7254 // Otherwise, if all widening options failed, Instruction is to be 7255 // replicated. This may create a successor for VPBB. 
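// handleReplication returns a fresh successor VPBasicBlock when the
// instruction is predicated and its recipe was placed in a separate region;
// that block becomes the insertion point for subsequent recipes.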
7256 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7257 Instr, Range, VPBB, PredInst2Recipe, Plan); 7258 if (NextVPBB != VPBB) { 7259 VPBB = NextVPBB; 7260 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 7261 : ""); 7262 } 7263 } 7264 } 7265 7266 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7267 // may also be empty, such as the last one VPBB, reflecting original 7268 // basic-blocks with no recipes. 7269 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7270 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7271 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7272 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7273 delete PreEntry; 7274 7275 // --------------------------------------------------------------------------- 7276 // Transform initial VPlan: Apply previously taken decisions, in order, to 7277 // bring the VPlan to its final state. 7278 // --------------------------------------------------------------------------- 7279 7280 // Apply Sink-After legal constraints. 7281 for (auto &Entry : SinkAfter) { 7282 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7283 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7284 Sink->moveAfter(Target); 7285 } 7286 7287 // Interleave memory: for each Interleave Group we marked earlier as relevant 7288 // for this VPlan, replace the Recipes widening its memory instructions with a 7289 // single VPInterleaveRecipe at its insertion point. 7290 for (auto IG : InterleaveGroups) { 7291 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7292 RecipeBuilder.getRecipe(IG->getInsertPos())); 7293 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) 7294 ->insertBefore(Recipe); 7295 7296 for (unsigned i = 0; i < IG->getFactor(); ++i) 7297 if (Instruction *Member = IG->getMember(i)) { 7298 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7299 } 7300 } 7301 7302 // Finally, if tail is folded by masking, introduce selects between the phi 7303 // and the live-out instruction of each reduction, at the end of the latch. 7304 if (CM.foldTailByMasking()) { 7305 Builder.setInsertPoint(VPBB); 7306 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 7307 for (auto &Reduction : Legal->getReductionVars()) { 7308 VPValue *Phi = Plan->getVPValue(Reduction.first); 7309 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); 7310 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 7311 } 7312 } 7313 7314 std::string PlanName; 7315 raw_string_ostream RSO(PlanName); 7316 unsigned VF = Range.Start; 7317 Plan->addVF(VF); 7318 RSO << "Initial VPlan for VF={" << VF; 7319 for (VF *= 2; VF < Range.End; VF *= 2) { 7320 Plan->addVF(VF); 7321 RSO << "," << VF; 7322 } 7323 RSO << "},UF>=1"; 7324 RSO.flush(); 7325 Plan->setName(PlanName); 7326 7327 return Plan; 7328 } 7329 7330 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 7331 // Outer loop handling: They may require CFG and instruction level 7332 // transformations before even evaluating whether vectorization is profitable. 7333 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7334 // the vectorization pipeline. 
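// In contrast to the inner-loop path above, a single VPlan is built here and
// every VF in Range is added to it; no cost-model-driven clamping of the
// range is performed.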
7335 assert(!OrigLoop->empty()); 7336 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7337 7338 // Create new empty VPlan 7339 auto Plan = std::make_unique<VPlan>(); 7340 7341 // Build hierarchical CFG 7342 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 7343 HCFGBuilder.buildHierarchicalCFG(); 7344 7345 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) 7346 Plan->addVF(VF); 7347 7348 if (EnableVPlanPredication) { 7349 VPlanPredicator VPP(*Plan); 7350 VPP.predicate(); 7351 7352 // Avoid running transformation to recipes until masked code generation in 7353 // VPlan-native path is in place. 7354 return Plan; 7355 } 7356 7357 SmallPtrSet<Instruction *, 1> DeadInstructions; 7358 VPlanTransforms::VPInstructionsToVPRecipes( 7359 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 7360 return Plan; 7361 } 7362 7363 Value* LoopVectorizationPlanner::VPCallbackILV:: 7364 getOrCreateVectorValues(Value *V, unsigned Part) { 7365 return ILV.getOrCreateVectorValue(V, Part); 7366 } 7367 7368 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 7369 Value *V, const VPIteration &Instance) { 7370 return ILV.getOrCreateScalarValue(V, Instance); 7371 } 7372 7373 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 7374 VPSlotTracker &SlotTracker) const { 7375 O << " +\n" 7376 << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7377 IG->getInsertPos()->printAsOperand(O, false); 7378 O << ", "; 7379 getAddr()->printAsOperand(O, SlotTracker); 7380 VPValue *Mask = getMask(); 7381 if (Mask) { 7382 O << ", "; 7383 Mask->printAsOperand(O, SlotTracker); 7384 } 7385 O << "\\l\""; 7386 for (unsigned i = 0; i < IG->getFactor(); ++i) 7387 if (Instruction *I = IG->getMember(i)) 7388 O << " +\n" 7389 << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\""; 7390 } 7391 7392 void VPWidenCallRecipe::execute(VPTransformState &State) { 7393 State.ILV->widenCallInstruction(Ingredient); 7394 } 7395 7396 void VPWidenRecipe::execute(VPTransformState &State) { 7397 State.ILV->widenInstruction(Ingredient); 7398 } 7399 7400 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7401 State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, 7402 IsIndexLoopInvariant); 7403 } 7404 7405 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7406 assert(!State.Instance && "Int or FP induction being replicated."); 7407 State.ILV->widenIntOrFpInduction(IV, Trunc); 7408 } 7409 7410 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7411 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7412 } 7413 7414 void VPBlendRecipe::execute(VPTransformState &State) { 7415 State.ILV->setDebugLocFromInst(State.Builder, Phi); 7416 // We know that all PHIs in non-header blocks are converted into 7417 // selects, so we don't have to worry about the insertion order and we 7418 // can just use the builder. 7419 // At this point we generate the predication tree. There may be 7420 // duplications since this is a simple recursive scan, but future 7421 // optimizations will clean it up. 
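// When present, User holds the incoming-edge masks collected in
// VPRecipeBuilder::tryToBlend, one operand per incoming value.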
7422 7423 unsigned NumIncoming = Phi->getNumIncomingValues(); 7424 7425 assert((User || NumIncoming == 1) && 7426 "Multiple predecessors with predecessors having a full mask"); 7427 // Generate a sequence of selects of the form: 7428 // SELECT(Mask3, In3, 7429 // SELECT(Mask2, In2, 7430 // ( ...))) 7431 InnerLoopVectorizer::VectorParts Entry(State.UF); 7432 for (unsigned In = 0; In < NumIncoming; ++In) { 7433 for (unsigned Part = 0; Part < State.UF; ++Part) { 7434 // We might have single edge PHIs (blocks) - use an identity 7435 // 'select' for the first PHI operand. 7436 Value *In0 = 7437 State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part); 7438 if (In == 0) 7439 Entry[Part] = In0; // Initialize with the first incoming value. 7440 else { 7441 // Select between the current value and the previous incoming edge 7442 // based on the incoming mask. 7443 Value *Cond = State.get(User->getOperand(In), Part); 7444 Entry[Part] = 7445 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 7446 } 7447 } 7448 } 7449 for (unsigned Part = 0; Part < State.UF; ++Part) 7450 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 7451 } 7452 7453 void VPInterleaveRecipe::execute(VPTransformState &State) { 7454 assert(!State.Instance && "Interleave group being replicated."); 7455 State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(), 7456 getMask()); 7457 } 7458 7459 void VPReplicateRecipe::execute(VPTransformState &State) { 7460 if (State.Instance) { // Generate a single instance. 7461 State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated); 7462 // Insert scalar instance packing it into a vector. 7463 if (AlsoPack && State.VF > 1) { 7464 // If we're constructing lane 0, initialize to start from undef. 7465 if (State.Instance->Lane == 0) { 7466 Value *Undef = 7467 UndefValue::get(VectorType::get(Ingredient->getType(), State.VF)); 7468 State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); 7469 } 7470 State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance); 7471 } 7472 return; 7473 } 7474 7475 // Generate scalar instances for all VF lanes of all UF parts, unless the 7476 // instruction is uniform, in which case generate only the first lane for each 7477 // of the UF parts. 7478 unsigned EndLane = IsUniform ? 1 : State.VF; 7479 for (unsigned Part = 0; Part < State.UF; ++Part) 7480 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 7481 State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated); 7482 } 7483 7484 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 7485 assert(State.Instance && "Branch on Mask works only on single instance."); 7486 7487 unsigned Part = State.Instance->Part; 7488 unsigned Lane = State.Instance->Lane; 7489 7490 Value *ConditionBit = nullptr; 7491 if (!User) // Block in mask is all-one. 7492 ConditionBit = State.Builder.getTrue(); 7493 else { 7494 VPValue *BlockInMask = User->getOperand(0); 7495 ConditionBit = State.get(BlockInMask, Part); 7496 if (ConditionBit->getType()->isVectorTy()) 7497 ConditionBit = State.Builder.CreateExtractElement( 7498 ConditionBit, State.Builder.getInt32(Lane)); 7499 } 7500 7501 // Replace the temporary unreachable terminator with a new conditional branch, 7502 // whose two destinations will be set later when they are created.
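// The two destinations are the '.if' and '.continue' blocks of the replicate
// region built in createReplicateRegion; they are hooked up when those
// VPBasicBlocks are themselves code-generated.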
7503 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 7504 assert(isa<UnreachableInst>(CurrentTerminator) && 7505 "Expected to replace unreachable terminator with conditional branch."); 7506 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 7507 CondBr->setSuccessor(0, nullptr); 7508 ReplaceInstWithInst(CurrentTerminator, CondBr); 7509 } 7510 7511 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 7512 assert(State.Instance && "Predicated instruction PHI works per instance."); 7513 Instruction *ScalarPredInst = cast<Instruction>( 7514 State.ValueMap.getScalarValue(PredInst, *State.Instance)); 7515 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 7516 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 7517 assert(PredicatingBB && "Predicated block has no single predecessor."); 7518 7519 // By current pack/unpack logic we need to generate only a single phi node: if 7520 // a vector value for the predicated instruction exists at this point it means 7521 // the instruction has vector users only, and a phi for the vector value is 7522 // needed. In this case the recipe of the predicated instruction is marked to 7523 // also do that packing, thereby "hoisting" the insert-element sequence. 7524 // Otherwise, a phi node for the scalar value is needed. 7525 unsigned Part = State.Instance->Part; 7526 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 7527 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 7528 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 7529 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 7530 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 7531 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 7532 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 7533 } else { 7534 Type *PredInstType = PredInst->getType(); 7535 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 7536 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 7537 Phi->addIncoming(ScalarPredInst, PredicatedBB); 7538 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 7539 } 7540 } 7541 7542 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 7543 VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr; 7544 State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue, 7545 getMask()); 7546 } 7547 7548 // Determine how to lower the scalar epilogue, which depends on 1) optimising 7549 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 7550 // predication, and 4) a TTI hook that analyses whether the loop is suitable 7551 // for predication. 7552 static ScalarEpilogueLowering getScalarEpilogueLowering( 7553 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 7554 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 7555 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 7556 LoopVectorizationLegality &LVL) { 7557 bool OptSize = 7558 F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 7559 PGSOQueryType::IRPass); 7560 // 1) OptSize takes precedence over all other options, i.e. if this is set, 7561 // don't look at hints or options, and don't request a scalar epilogue. 
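// Note that an explicit 'vectorize(enable)' force hint (FK_Enabled) still
// overrides the size heuristic, hence the extra check below.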
7562 if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) 7563 return CM_ScalarEpilogueNotAllowedOptSize; 7564 7565 bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && 7566 !PreferPredicateOverEpilog; 7567 7568 // 2) Next, if disabling predication is requested on the command line, honour 7569 // this and request a scalar epilogue. Also do this if we don't have a 7570 // primary induction variable, which is required for predication. 7571 if (PredicateOptDisabled || !LVL.getPrimaryInduction()) 7572 return CM_ScalarEpilogueAllowed; 7573 7574 // 3) and 4) check whether enabling predication is requested on the command 7575 // line, with a loop hint, or whether the TTI hook indicates this is 7576 // profitable; if so, request predication. 7577 if (PreferPredicateOverEpilog || 7578 Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || 7579 (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 7580 LVL.getLAI()) && 7581 Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) 7582 return CM_ScalarEpilogueNotNeededUsePredicate; 7583 7584 return CM_ScalarEpilogueAllowed; 7585 } 7586 7587 // Process the loop in the VPlan-native vectorization path. This path builds 7588 // VPlan upfront in the vectorization pipeline, which allows applying 7589 // VPlan-to-VPlan transformations from the very beginning without modifying the 7590 // input LLVM IR. 7591 static bool processLoopInVPlanNativePath( 7592 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 7593 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 7594 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 7595 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 7596 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 7597 7598 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 7599 Function *F = L->getHeader()->getParent(); 7600 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 7601 7602 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7603 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 7604 7605 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 7606 &Hints, IAI); 7607 // Use the planner for outer loop vectorization. 7608 // TODO: CM is not used at this point inside the planner. Turn CM into an 7609 // optional argument if we don't need it in the future. 7610 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI); 7611 7612 // Get user vectorization factor. 7613 const unsigned UserVF = Hints.getWidth(); 7614 7615 // Plan how to best vectorize, return the best VF and its cost. 7616 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 7617 7618 // If we are stress testing VPlan builds, do not attempt to generate vector 7619 // code. Masked vector code generation support will follow soon. 7620 // Also, do not attempt to vectorize if no vector code will be produced. 7621 if (VPlanBuildStressTest || EnableVPlanPredication || 7622 VectorizationFactor::Disabled() == VF) 7623 return false; 7624 7625 LVP.setBestPlan(VF.Width, 1); 7626 7627 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 7628 &CM); 7629 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 7630 << L->getHeader()->getParent()->getName() << "\"\n"); 7631 LVP.executePlan(LB, DT); 7632 7633 // Mark the loop as already vectorized to avoid vectorizing again.
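// setAlreadyVectorized records this in the loop's metadata, so later runs of
// the vectorizer will skip the loop.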
7634 Hints.setAlreadyVectorized(); 7635 7636 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); 7637 return true; 7638 } 7639 7640 bool LoopVectorizePass::processLoop(Loop *L) { 7641 assert((EnableVPlanNativePath || L->empty()) && 7642 "VPlan-native path is not enabled. Only process inner loops."); 7643 7644 #ifndef NDEBUG 7645 const std::string DebugLocStr = getDebugLocString(L); 7646 #endif /* NDEBUG */ 7647 7648 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 7649 << L->getHeader()->getParent()->getName() << "\" from " 7650 << DebugLocStr << "\n"); 7651 7652 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 7653 7654 LLVM_DEBUG( 7655 dbgs() << "LV: Loop hints:" 7656 << " force=" 7657 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 7658 ? "disabled" 7659 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 7660 ? "enabled" 7661 : "?")) 7662 << " width=" << Hints.getWidth() 7663 << " unroll=" << Hints.getInterleave() << "\n"); 7664 7665 // Function containing loop 7666 Function *F = L->getHeader()->getParent(); 7667 7668 // Looking at the diagnostic output is the only way to determine if a loop 7669 // was vectorized (other than looking at the IR or machine code), so it 7670 // is important to generate an optimization remark for each loop. Most of 7671 // these messages are generated as OptimizationRemarkAnalysis. Remarks 7672 // generated as OptimizationRemark and OptimizationRemarkMissed are 7673 // less verbose reporting vectorized loops and unvectorized loops that may 7674 // benefit from vectorization, respectively. 7675 7676 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 7677 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 7678 return false; 7679 } 7680 7681 PredicatedScalarEvolution PSE(*SE, *L); 7682 7683 // Check if it is legal to vectorize the loop. 7684 LoopVectorizationRequirements Requirements(*ORE); 7685 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 7686 &Requirements, &Hints, DB, AC); 7687 if (!LVL.canVectorize(EnableVPlanNativePath)) { 7688 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 7689 Hints.emitRemarkWithHints(); 7690 return false; 7691 } 7692 7693 // Check the function attributes and profiles to find out if this function 7694 // should be optimized for size. 7695 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7696 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 7697 7698 // Entrance to the VPlan-native vectorization path. Outer loops are processed 7699 // here. They may require CFG and instruction level transformations before 7700 // even evaluating whether vectorization is profitable. Since we cannot modify 7701 // the incoming IR, we need to build VPlan upfront in the vectorization 7702 // pipeline. 7703 if (!L->empty()) 7704 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 7705 ORE, BFI, PSI, Hints); 7706 7707 assert(L->empty() && "Inner loop expected."); 7708 7709 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 7710 // count by optimizing for size, to minimize overheads. 7711 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 7712 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 7713 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. 
" 7714 << "This loop is worth vectorizing only if no scalar " 7715 << "iteration overheads are incurred."); 7716 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 7717 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 7718 else { 7719 LLVM_DEBUG(dbgs() << "\n"); 7720 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 7721 } 7722 } 7723 7724 // Check the function attributes to see if implicit floats are allowed. 7725 // FIXME: This check doesn't seem possibly correct -- what if the loop is 7726 // an integer loop and the vector instructions selected are purely integer 7727 // vector instructions? 7728 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 7729 reportVectorizationFailure( 7730 "Can't vectorize when the NoImplicitFloat attribute is used", 7731 "loop not vectorized due to NoImplicitFloat attribute", 7732 "NoImplicitFloat", ORE, L); 7733 Hints.emitRemarkWithHints(); 7734 return false; 7735 } 7736 7737 // Check if the target supports potentially unsafe FP vectorization. 7738 // FIXME: Add a check for the type of safety issue (denormal, signaling) 7739 // for the target we're vectorizing for, to make sure none of the 7740 // additional fp-math flags can help. 7741 if (Hints.isPotentiallyUnsafe() && 7742 TTI->isFPVectorizationPotentiallyUnsafe()) { 7743 reportVectorizationFailure( 7744 "Potentially unsafe FP op prevents vectorization", 7745 "loop not vectorized due to unsafe FP support.", 7746 "UnsafeFP", ORE, L); 7747 Hints.emitRemarkWithHints(); 7748 return false; 7749 } 7750 7751 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 7752 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 7753 7754 // If an override option has been passed in for interleaved accesses, use it. 7755 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 7756 UseInterleaved = EnableInterleavedMemAccesses; 7757 7758 // Analyze interleaved memory accesses. 7759 if (UseInterleaved) { 7760 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 7761 } 7762 7763 // Use the cost model. 7764 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 7765 F, &Hints, IAI); 7766 CM.collectValuesToIgnore(); 7767 7768 // Use the planner for vectorization. 7769 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI); 7770 7771 // Get user vectorization factor. 7772 unsigned UserVF = Hints.getWidth(); 7773 7774 // Plan how to best vectorize, return the best VF and its cost. 7775 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF); 7776 7777 VectorizationFactor VF = VectorizationFactor::Disabled(); 7778 unsigned IC = 1; 7779 unsigned UserIC = Hints.getInterleave(); 7780 7781 if (MaybeVF) { 7782 VF = *MaybeVF; 7783 // Select the interleave count. 7784 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 7785 } 7786 7787 // Identify the diagnostic messages that should be produced. 
7788 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 7789 bool VectorizeLoop = true, InterleaveLoop = true; 7790 if (Requirements.doesNotMeet(F, L, Hints)) { 7791 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " 7792 "requirements.\n"); 7793 Hints.emitRemarkWithHints(); 7794 return false; 7795 } 7796 7797 if (VF.Width == 1) { 7798 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 7799 VecDiagMsg = std::make_pair( 7800 "VectorizationNotBeneficial", 7801 "the cost-model indicates that vectorization is not beneficial"); 7802 VectorizeLoop = false; 7803 } 7804 7805 if (!MaybeVF && UserIC > 1) { 7806 // Tell the user interleaving was avoided up-front, despite being explicitly 7807 // requested. 7808 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 7809 "interleaving should be avoided up front\n"); 7810 IntDiagMsg = std::make_pair( 7811 "InterleavingAvoided", 7812 "Ignoring UserIC, because interleaving was avoided up front"); 7813 InterleaveLoop = false; 7814 } else if (IC == 1 && UserIC <= 1) { 7815 // Tell the user interleaving is not beneficial. 7816 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 7817 IntDiagMsg = std::make_pair( 7818 "InterleavingNotBeneficial", 7819 "the cost-model indicates that interleaving is not beneficial"); 7820 InterleaveLoop = false; 7821 if (UserIC == 1) { 7822 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; 7823 IntDiagMsg.second += 7824 " and is explicitly disabled or interleave count is set to 1"; 7825 } 7826 } else if (IC > 1 && UserIC == 1) { 7827 // Tell the user interleaving is beneficial, but it is explicitly disabled. 7828 LLVM_DEBUG( 7829 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); 7830 IntDiagMsg = std::make_pair( 7831 "InterleavingBeneficialButDisabled", 7832 "the cost-model indicates that interleaving is beneficial " 7833 "but is explicitly disabled or interleave count is set to 1"); 7834 InterleaveLoop = false; 7835 } 7836 7837 // Override IC if user provided an interleave count. 7838 IC = UserIC > 0 ? UserIC : IC; 7839 7840 // Emit diagnostic messages, if any. 7841 const char *VAPassName = Hints.vectorizeAnalysisPassName(); 7842 if (!VectorizeLoop && !InterleaveLoop) { 7843 // Do not vectorize or interleave the loop.
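// Report both failures as missed-optimization remarks before giving up on
// this loop.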
7844 ORE->emit([&]() { 7845 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 7846 L->getStartLoc(), L->getHeader()) 7847 << VecDiagMsg.second; 7848 }); 7849 ORE->emit([&]() { 7850 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 7851 L->getStartLoc(), L->getHeader()) 7852 << IntDiagMsg.second; 7853 }); 7854 return false; 7855 } else if (!VectorizeLoop && InterleaveLoop) { 7856 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7857 ORE->emit([&]() { 7858 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 7859 L->getStartLoc(), L->getHeader()) 7860 << VecDiagMsg.second; 7861 }); 7862 } else if (VectorizeLoop && !InterleaveLoop) { 7863 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7864 << ") in " << DebugLocStr << '\n'); 7865 ORE->emit([&]() { 7866 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 7867 L->getStartLoc(), L->getHeader()) 7868 << IntDiagMsg.second; 7869 }); 7870 } else if (VectorizeLoop && InterleaveLoop) { 7871 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7872 << ") in " << DebugLocStr << '\n'); 7873 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7874 } 7875 7876 LVP.setBestPlan(VF.Width, IC); 7877 7878 using namespace ore; 7879 bool DisableRuntimeUnroll = false; 7880 MDNode *OrigLoopID = L->getLoopID(); 7881 7882 if (!VectorizeLoop) { 7883 assert(IC > 1 && "interleave count should not be 1 or 0"); 7884 // If we decided that it is not legal to vectorize the loop, then 7885 // interleave it. 7886 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 7887 &CM); 7888 LVP.executePlan(Unroller, DT); 7889 7890 ORE->emit([&]() { 7891 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 7892 L->getHeader()) 7893 << "interleaved loop (interleaved count: " 7894 << NV("InterleaveCount", IC) << ")"; 7895 }); 7896 } else { 7897 // If we decided that it is *legal* to vectorize the loop, then do it. 7898 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 7899 &LVL, &CM); 7900 LVP.executePlan(LB, DT); 7901 ++LoopsVectorized; 7902 7903 // Add metadata to disable runtime unrolling a scalar loop when there are 7904 // no runtime checks about strides and memory. A scalar loop that is 7905 // rarely used is not worth unrolling. 7906 if (!LB.areSafetyChecksAdded()) 7907 DisableRuntimeUnroll = true; 7908 7909 // Report the vectorization decision. 7910 ORE->emit([&]() { 7911 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 7912 L->getHeader()) 7913 << "vectorized loop (vectorization width: " 7914 << NV("VectorizationFactor", VF.Width) 7915 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 7916 }); 7917 } 7918 7919 Optional<MDNode *> RemainderLoopID = 7920 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7921 LLVMLoopVectorizeFollowupEpilogue}); 7922 if (RemainderLoopID.hasValue()) { 7923 L->setLoopID(RemainderLoopID.getValue()); 7924 } else { 7925 if (DisableRuntimeUnroll) 7926 AddRuntimeUnrollDisableMetaData(L); 7927 7928 // Mark the loop as already vectorized to avoid vectorizing again. 
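// This branch is only taken when no follow-up loop metadata was provided
// above; the loop is then marked so the vectorizer does not process it again.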
7929 Hints.setAlreadyVectorized(); 7930 } 7931 7932 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); 7933 return true; 7934 } 7935 7936 bool LoopVectorizePass::runImpl( 7937 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 7938 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 7939 DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, 7940 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 7941 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 7942 SE = &SE_; 7943 LI = &LI_; 7944 TTI = &TTI_; 7945 DT = &DT_; 7946 BFI = &BFI_; 7947 TLI = TLI_; 7948 AA = &AA_; 7949 AC = &AC_; 7950 GetLAA = &GetLAA_; 7951 DB = &DB_; 7952 ORE = &ORE_; 7953 PSI = PSI_; 7954 7955 // Don't attempt if 7956 // 1. the target claims to have no vector registers, and 7957 // 2. interleaving won't help ILP. 7958 // 7959 // The second condition is necessary because, even if the target has no 7960 // vector registers, loop vectorization may still enable scalar 7961 // interleaving. 7962 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 7963 TTI->getMaxInterleaveFactor(1) < 2) 7964 return false; 7965 7966 bool Changed = false; 7967 7968 // The vectorizer requires loops to be in simplified form. 7969 // Since simplification may add new inner loops, it has to run before the 7970 // legality and profitability checks. This means running the loop vectorizer 7971 // will simplify all loops, regardless of whether anything ends up being 7972 // vectorized. 7973 for (auto &L : *LI) 7974 Changed |= 7975 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 7976 7977 // Build up a worklist of inner-loops to vectorize. This is necessary as 7978 // the act of vectorizing or partially unrolling a loop creates new loops 7979 // and can invalidate iterators across the loops. 7980 SmallVector<Loop *, 8> Worklist; 7981 7982 for (Loop *L : *LI) 7983 collectSupportedLoops(*L, LI, ORE, Worklist); 7984 7985 LoopsAnalyzed += Worklist.size(); 7986 7987 // Now walk the identified inner loops. 7988 while (!Worklist.empty()) { 7989 Loop *L = Worklist.pop_back_val(); 7990 7991 // For the inner loops we actually process, form LCSSA to simplify the 7992 // transform. 7993 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 7994 7995 Changed |= processLoop(L); 7996 } 7997 7998 // Process each loop nest in the function. 7999 return Changed; 8000 } 8001 8002 PreservedAnalyses LoopVectorizePass::run(Function &F, 8003 FunctionAnalysisManager &AM) { 8004 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 8005 auto &LI = AM.getResult<LoopAnalysis>(F); 8006 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 8007 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 8008 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 8009 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 8010 auto &AA = AM.getResult<AAManager>(F); 8011 auto &AC = AM.getResult<AssumptionAnalysis>(F); 8012 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 8013 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 8014 MemorySSA *MSSA = EnableMSSALoopDependency 8015 ?
&AM.getResult<MemorySSAAnalysis>(F).getMSSA() 8016 : nullptr; 8017 8018 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 8019 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 8020 [&](Loop &L) -> const LoopAccessInfo & { 8021 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; 8022 return LAM.getResult<LoopAccessAnalysis>(L, AR); 8023 }; 8024 const ModuleAnalysisManager &MAM = 8025 AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager(); 8026 ProfileSummaryInfo *PSI = 8027 MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 8028 bool Changed = 8029 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 8030 if (!Changed) 8031 return PreservedAnalyses::all(); 8032 PreservedAnalyses PA; 8033 8034 // We currently do not preserve loopinfo/dominator analyses with outer loop 8035 // vectorization. Until this is addressed, mark these analyses as preserved 8036 // only for non-VPlan-native path. 8037 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 8038 if (!EnableVPlanNativePath) { 8039 PA.preserve<LoopAnalysis>(); 8040 PA.preserve<DominatorTreeAnalysis>(); 8041 } 8042 PA.preserve<BasicAA>(); 8043 PA.preserve<GlobalsAA>(); 8044 return PA; 8045 } 8046