//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired and predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a "
             "loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of a loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
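  // For example, under a typical data layout an i20 would be irregular: its
  // type size is 20 bits but its allocation size is 32 bits, so an array of
  // i20 values has padding between elements.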
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
                bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive.
  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
                            bool IfPredicateInstr);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate
  /// a vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize the interleaved access group that \p Instr belongs to
  /// with the base address given in \p Addr, optionally masking the vector
  /// operations if \p BlockInMask is non-null. Use \p State to translate given
  /// VPValues to IR values in the vectorized loop.
  void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
                                VPValue *Addr, VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr, VPValue *StoredValue,
                                  VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variables.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in
  /// the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned
  /// above (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()).
  /// In the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AliasAnalysis *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back
    // to using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF();

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
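    // The Uniforms map is keyed by VF, so an existing entry means the analysis
    // below has already been run and cached for this VF.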
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    MaybeAlign Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. not disallowed due to
  /// optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

  /// The cost computation for interleaving group of memory instructions.
1345 unsigned getInterleaveGroupCost(Instruction *I, unsigned VF); 1346 1347 /// The cost computation for Gather/Scatter instruction. 1348 unsigned getGatherScatterCost(Instruction *I, unsigned VF); 1349 1350 /// The cost computation for widening instruction \p I with consecutive 1351 /// memory access. 1352 unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF); 1353 1354 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1355 /// Load: scalar load + broadcast. 1356 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1357 /// element) 1358 unsigned getUniformMemOpCost(Instruction *I, unsigned VF); 1359 1360 /// Estimate the overhead of scalarizing an instruction. This is a 1361 /// convenience wrapper for the type-based getScalarizationOverhead API. 1362 unsigned getScalarizationOverhead(Instruction *I, unsigned VF); 1363 1364 /// Returns whether the instruction is a load or store and will be emitted 1365 /// as a vector operation. 1366 bool isConsecutiveLoadOrStore(Instruction *I); 1367 1368 /// Returns true if an artificially high cost for emulated masked memrefs 1369 /// should be used. 1370 bool useEmulatedMaskMemRefHack(Instruction *I); 1371 1372 /// Map of scalar integer values to the smallest bitwidth they can be legally 1373 /// represented as. The vector equivalents of these values should be truncated 1374 /// to this type. 1375 MapVector<Instruction *, uint64_t> MinBWs; 1376 1377 /// A type representing the costs for instructions if they were to be 1378 /// scalarized rather than vectorized. The entries are Instruction-Cost 1379 /// pairs. 1380 using ScalarCostsTy = DenseMap<Instruction *, unsigned>; 1381 1382 /// A set containing all BasicBlocks that are known to be present after 1383 /// vectorization as predicated blocks. 1384 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; 1385 1386 /// Records whether it is allowed to have the original scalar loop execute at 1387 /// least once. This may be needed as a fallback loop in case runtime 1388 /// aliasing/dependence checks fail, or to handle the tail/remainder 1389 /// iterations when the trip count is unknown or doesn't divide by the VF, 1390 /// or as a peel-loop to handle gaps in interleave-groups. 1391 /// Under optsize and when the trip count is very small we don't allow any 1392 /// iterations to execute in the scalar loop. 1393 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1394 1395 /// All blocks of the loop are to be masked to fold the tail of the scalar iterations. 1396 bool FoldTailByMasking = false; 1397 1398 /// A map holding scalar costs for different vectorization factors. The 1399 /// presence of a cost for an instruction in the mapping indicates that the 1400 /// instruction will be scalarized when vectorizing with the associated 1401 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1402 DenseMap<unsigned, ScalarCostsTy> InstsToScalarize; 1403 1404 /// Holds the instructions known to be uniform after vectorization. 1405 /// The data is collected per VF. 1406 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms; 1407 1408 /// Holds the instructions known to be scalar after vectorization. 1409 /// The data is collected per VF. 1410 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars; 1411 1412 /// Holds the instructions (address computations) that are forced to be 1413 /// scalarized.
1414 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1415 1416 /// Returns the expected difference in cost from scalarizing the expression 1417 /// feeding a predicated instruction \p PredInst. The instructions to 1418 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1419 /// non-negative return value implies the expression will be scalarized. 1420 /// Currently, only single-use chains are considered for scalarization. 1421 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1422 unsigned VF); 1423 1424 /// Collect the instructions that are uniform after vectorization. An 1425 /// instruction is uniform if we represent it with a single scalar value in 1426 /// the vectorized loop corresponding to each vector iteration. Examples of 1427 /// uniform instructions include pointer operands of consecutive or 1428 /// interleaved memory accesses. Note that although uniformity implies an 1429 /// instruction will be scalar, the reverse is not true. In general, a 1430 /// scalarized instruction will be represented by VF scalar values in the 1431 /// vectorized loop, each corresponding to an iteration of the original 1432 /// scalar loop. 1433 void collectLoopUniforms(unsigned VF); 1434 1435 /// Collect the instructions that are scalar after vectorization. An 1436 /// instruction is scalar if it is known to be uniform or will be scalarized 1437 /// during vectorization. Non-uniform scalarized instructions will be 1438 /// represented by VF values in the vectorized loop, each corresponding to an 1439 /// iteration of the original scalar loop. 1440 void collectLoopScalars(unsigned VF); 1441 1442 /// Keeps cost model vectorization decision and cost for instructions. 1443 /// Right now it is used for memory instructions only. 1444 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, 1445 std::pair<InstWidening, unsigned>>; 1446 1447 DecisionList WideningDecisions; 1448 1449 /// Returns true if \p V is expected to be vectorized and it needs to be 1450 /// extracted. 1451 bool needsExtract(Value *V, unsigned VF) const { 1452 Instruction *I = dyn_cast<Instruction>(V); 1453 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) 1454 return false; 1455 1456 // Assume we can vectorize V (and hence we need extraction) if the 1457 // scalars are not computed yet. This can happen, because it is called 1458 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1459 // the scalars are collected. That should be a safe assumption in most 1460 // cases, because we check if the operands have vectorizable types 1461 // beforehand in LoopVectorizationLegality. 1462 return Scalars.find(VF) == Scalars.end() || 1463 !isScalarAfterVectorization(I, VF); 1464 }; 1465 1466 /// Returns a range containing only operands needing to be extracted. 1467 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1468 unsigned VF) { 1469 return SmallVector<Value *, 4>(make_filter_range( 1470 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1471 } 1472 1473 public: 1474 /// The loop that we evaluate. 1475 Loop *TheLoop; 1476 1477 /// Predicated scalar evolution analysis. 1478 PredicatedScalarEvolution &PSE; 1479 1480 /// Loop Info analysis. 1481 LoopInfo *LI; 1482 1483 /// Vectorization legality. 1484 LoopVectorizationLegality *Legal; 1485 1486 /// Vector target information. 1487 const TargetTransformInfo &TTI; 1488 1489 /// Target Library Info. 
1490 const TargetLibraryInfo *TLI; 1491 1492 /// Demanded bits analysis. 1493 DemandedBits *DB; 1494 1495 /// Assumption cache. 1496 AssumptionCache *AC; 1497 1498 /// Interface to emit optimization remarks. 1499 OptimizationRemarkEmitter *ORE; 1500 1501 const Function *TheFunction; 1502 1503 /// Loop Vectorize Hint. 1504 const LoopVectorizeHints *Hints; 1505 1506 /// The interleave access information contains groups of interleaved accesses 1507 /// with the same stride and close to each other. 1508 InterleavedAccessInfo &InterleaveInfo; 1509 1510 /// Values to ignore in the cost model. 1511 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1512 1513 /// Values to ignore in the cost model when VF > 1. 1514 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1515 }; 1516 1517 } // end namespace llvm 1518 1519 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1520 // vectorization. The loop needs to be annotated with #pragma omp simd 1521 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1522 // vector length information is not provided, vectorization is not considered 1523 // explicit. Interleave hints are not allowed either. These limitations will be 1524 // relaxed in the future. 1525 // Please, note that we are currently forced to abuse the pragma 'clang 1526 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1527 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1528 // provides *explicit vectorization hints* (LV can bypass legal checks and 1529 // assume that vectorization is legal). However, both hints are implemented 1530 // using the same metadata (llvm.loop.vectorize, processed by 1531 // LoopVectorizeHints). This will be fixed in the future when the native IR 1532 // representation for pragma 'omp simd' is introduced. 1533 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1534 OptimizationRemarkEmitter *ORE) { 1535 assert(!OuterLp->empty() && "This is not an outer loop"); 1536 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1537 1538 // Only outer loops with an explicit vectorization hint are supported. 1539 // Unannotated outer loops are ignored. 1540 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1541 return false; 1542 1543 Function *Fn = OuterLp->getHeader()->getParent(); 1544 if (!Hints.allowVectorization(Fn, OuterLp, 1545 true /*VectorizeOnlyWhenForced*/)) { 1546 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1547 return false; 1548 } 1549 1550 if (Hints.getInterleave() > 1) { 1551 // TODO: Interleave support is future work. 1552 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1553 "outer loops.\n"); 1554 Hints.emitRemarkWithHints(); 1555 return false; 1556 } 1557 1558 return true; 1559 } 1560 1561 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1562 OptimizationRemarkEmitter *ORE, 1563 SmallVectorImpl<Loop *> &V) { 1564 // Collect inner loops and outer loops without irreducible control flow. For 1565 // now, only collect outer loops that have explicit vectorization hints. If we 1566 // are stress testing the VPlan H-CFG construction, we collect the outermost 1567 // loop of every loop nest. 
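// For illustration only (loop body and width are made-up examples), an outer
// loop that qualifies for collection on the VPlan-native path carries an
// explicit hint with a vector length, e.g.:
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < M; ++i)        // explicitly annotated outer loop
//     for (int j = 0; j < N; ++j)      // inner loop
//       A[i][j] = B[i][j] + C[i];
//
// or the equivalent '#pragma omp simd simdlen(4)' form described above.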
1568 if (L.empty() || VPlanBuildStressTest || 1569 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1570 LoopBlocksRPO RPOT(&L); 1571 RPOT.perform(LI); 1572 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1573 V.push_back(&L); 1574 // TODO: Collect inner loops inside marked outer loops in case 1575 // vectorization fails for the outer loop. Do not invoke 1576 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1577 // already known to be reducible. We can use an inherited attribute for 1578 // that. 1579 return; 1580 } 1581 } 1582 for (Loop *InnerL : L) 1583 collectSupportedLoops(*InnerL, LI, ORE, V); 1584 } 1585 1586 namespace { 1587 1588 /// The LoopVectorize Pass. 1589 struct LoopVectorize : public FunctionPass { 1590 /// Pass identification, replacement for typeid 1591 static char ID; 1592 1593 LoopVectorizePass Impl; 1594 1595 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1596 bool VectorizeOnlyWhenForced = false) 1597 : FunctionPass(ID) { 1598 Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced; 1599 Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced; 1600 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1601 } 1602 1603 bool runOnFunction(Function &F) override { 1604 if (skipFunction(F)) 1605 return false; 1606 1607 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1608 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1609 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1610 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1611 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1612 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1613 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1614 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1615 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1616 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1617 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1618 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1619 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1620 1621 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1622 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1623 1624 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1625 GetLAA, *ORE, PSI); 1626 } 1627 1628 void getAnalysisUsage(AnalysisUsage &AU) const override { 1629 AU.addRequired<AssumptionCacheTracker>(); 1630 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1631 AU.addRequired<DominatorTreeWrapperPass>(); 1632 AU.addRequired<LoopInfoWrapperPass>(); 1633 AU.addRequired<ScalarEvolutionWrapperPass>(); 1634 AU.addRequired<TargetTransformInfoWrapperPass>(); 1635 AU.addRequired<AAResultsWrapperPass>(); 1636 AU.addRequired<LoopAccessLegacyAnalysis>(); 1637 AU.addRequired<DemandedBitsWrapperPass>(); 1638 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1639 AU.addRequired<InjectTLIMappingsLegacy>(); 1640 1641 // We currently do not preserve loopinfo/dominator analyses with outer loop 1642 // vectorization. Until this is addressed, mark these analyses as preserved 1643 // only for non-VPlan-native path. 1644 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
1645 if (!EnableVPlanNativePath) { 1646 AU.addPreserved<LoopInfoWrapperPass>(); 1647 AU.addPreserved<DominatorTreeWrapperPass>(); 1648 } 1649 1650 AU.addPreserved<BasicAAWrapperPass>(); 1651 AU.addPreserved<GlobalsAAWrapperPass>(); 1652 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1653 } 1654 }; 1655 1656 } // end anonymous namespace 1657 1658 //===----------------------------------------------------------------------===// 1659 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1660 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1661 //===----------------------------------------------------------------------===// 1662 1663 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1664 // We need to place the broadcast of invariant variables outside the loop, 1665 // but only if it's proven safe to do so. Else, broadcast will be inside 1666 // vector loop body. 1667 Instruction *Instr = dyn_cast<Instruction>(V); 1668 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1669 (!Instr || 1670 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1671 // Place the code for broadcasting invariant variables in the new preheader. 1672 IRBuilder<>::InsertPointGuard Guard(Builder); 1673 if (SafeToHoist) 1674 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1675 1676 // Broadcast the scalar into all locations in the vector. 1677 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1678 1679 return Shuf; 1680 } 1681 1682 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1683 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1684 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1685 "Expected either an induction phi-node or a truncate of it!"); 1686 Value *Start = II.getStartValue(); 1687 1688 // Construct the initial value of the vector IV in the vector loop preheader 1689 auto CurrIP = Builder.saveIP(); 1690 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1691 if (isa<TruncInst>(EntryVal)) { 1692 assert(Start->getType()->isIntegerTy() && 1693 "Truncation requires an integer type"); 1694 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1695 Step = Builder.CreateTrunc(Step, TruncType); 1696 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1697 } 1698 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1699 Value *SteppedStart = 1700 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1701 1702 // We create vector phi nodes for both integer and floating-point induction 1703 // variables. Here, we determine the kind of arithmetic we will perform. 1704 Instruction::BinaryOps AddOp; 1705 Instruction::BinaryOps MulOp; 1706 if (Step->getType()->isIntegerTy()) { 1707 AddOp = Instruction::Add; 1708 MulOp = Instruction::Mul; 1709 } else { 1710 AddOp = II.getInductionOpcode(); 1711 MulOp = Instruction::FMul; 1712 } 1713 1714 // Multiply the vectorization factor by the step using integer or 1715 // floating-point arithmetic as appropriate. 1716 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); 1717 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1718 1719 // Create a vector splat to use in the induction update. 1720 // 1721 // FIXME: If the step is non-constant, we create the vector splat with 1722 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1723 // handle a constant vector splat. 1724 Value *SplatVF = 1725 isa<Constant>(Mul) 1726 ? 
ConstantVector::getSplat({VF, false}, cast<Constant>(Mul)) 1727 : Builder.CreateVectorSplat(VF, Mul); 1728 Builder.restoreIP(CurrIP); 1729 1730 // We may need to add the step a number of times, depending on the unroll 1731 // factor. The last of those goes into the PHI. 1732 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1733 &*LoopVectorBody->getFirstInsertionPt()); 1734 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1735 Instruction *LastInduction = VecInd; 1736 for (unsigned Part = 0; Part < UF; ++Part) { 1737 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1738 1739 if (isa<TruncInst>(EntryVal)) 1740 addMetadata(LastInduction, EntryVal); 1741 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1742 1743 LastInduction = cast<Instruction>(addFastMathFlag( 1744 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1745 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1746 } 1747 1748 // Move the last step to the end of the latch block. This ensures consistent 1749 // placement of all induction updates. 1750 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1751 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1752 auto *ICmp = cast<Instruction>(Br->getCondition()); 1753 LastInduction->moveBefore(ICmp); 1754 LastInduction->setName("vec.ind.next"); 1755 1756 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1757 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1758 } 1759 1760 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1761 return Cost->isScalarAfterVectorization(I, VF) || 1762 Cost->isProfitableToScalarize(I, VF); 1763 } 1764 1765 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1766 if (shouldScalarizeInstruction(IV)) 1767 return true; 1768 auto isScalarInst = [&](User *U) -> bool { 1769 auto *I = cast<Instruction>(U); 1770 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1771 }; 1772 return llvm::any_of(IV->users(), isScalarInst); 1773 } 1774 1775 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1776 const InductionDescriptor &ID, const Instruction *EntryVal, 1777 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1778 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1779 "Expected either an induction phi-node or a truncate of it!"); 1780 1781 // This induction variable is not the phi from the original loop but the 1782 // newly-created IV based on the proof that casted Phi is equal to the 1783 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1784 // re-uses the same InductionDescriptor that original IV uses but we don't 1785 // have to do any recording in this case - that is done when original IV is 1786 // processed. 1787 if (isa<TruncInst>(EntryVal)) 1788 return; 1789 1790 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1791 if (Casts.empty()) 1792 return; 1793 // Only the first Cast instruction in the Casts vector is of interest. 1794 // The rest of the Casts (if exist) have no uses outside the 1795 // induction update chain itself. 
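// A typical cast chain recorded in the descriptor looks roughly like the
// following (illustrative IR, names made up):
//
//   %iv     = phi i32 [ 0, %preheader ], [ %iv.next, %latch ]
//   %iv.ext = sext i32 %iv to i64                     ; the recorded cast
//   %gep    = getelementptr inbounds i32, i32* %base, i64 %iv.ext
//
// where SCEV has proven (possibly under a runtime predicate) that %iv.ext is
// equal to the uncasted IV, so it suffices to map the widened value to the
// first cast.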
1796 Instruction *CastInst = *Casts.begin(); 1797 if (Lane < UINT_MAX) 1798 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1799 else 1800 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1801 } 1802 1803 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1804 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1805 "Primary induction variable must have an integer type"); 1806 1807 auto II = Legal->getInductionVars().find(IV); 1808 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 1809 1810 auto ID = II->second; 1811 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1812 1813 // The value from the original loop to which we are mapping the new induction 1814 // variable. 1815 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1816 1817 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1818 1819 // Generate code for the induction step. Note that induction steps are 1820 // required to be loop-invariant 1821 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 1822 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 1823 "Induction step should be loop invariant"); 1824 if (PSE.getSE()->isSCEVable(IV->getType())) { 1825 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1826 return Exp.expandCodeFor(Step, Step->getType(), 1827 LoopVectorPreHeader->getTerminator()); 1828 } 1829 return cast<SCEVUnknown>(Step)->getValue(); 1830 }; 1831 1832 // The scalar value to broadcast. This is derived from the canonical 1833 // induction variable. If a truncation type is given, truncate the canonical 1834 // induction variable and step. Otherwise, derive these values from the 1835 // induction descriptor. 1836 auto CreateScalarIV = [&](Value *&Step) -> Value * { 1837 Value *ScalarIV = Induction; 1838 if (IV != OldInduction) { 1839 ScalarIV = IV->getType()->isIntegerTy() 1840 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1841 : Builder.CreateCast(Instruction::SIToFP, Induction, 1842 IV->getType()); 1843 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1844 ScalarIV->setName("offset.idx"); 1845 } 1846 if (Trunc) { 1847 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1848 assert(Step->getType()->isIntegerTy() && 1849 "Truncation requires an integer step"); 1850 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1851 Step = Builder.CreateTrunc(Step, TruncType); 1852 } 1853 return ScalarIV; 1854 }; 1855 1856 // Create the vector values from the scalar IV, in the absence of creating a 1857 // vector IV. 1858 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 1859 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1860 for (unsigned Part = 0; Part < UF; ++Part) { 1861 Value *EntryPart = 1862 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); 1863 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1864 if (Trunc) 1865 addMetadata(EntryPart, Trunc); 1866 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1867 } 1868 }; 1869 1870 // Now do the actual transformations, and start with creating the step value. 1871 Value *Step = CreateStepValue(ID.getStep()); 1872 if (VF <= 1) { 1873 Value *ScalarIV = CreateScalarIV(Step); 1874 CreateSplatIV(ScalarIV, Step); 1875 return; 1876 } 1877 1878 // Determine if we want a scalar version of the induction variable. 
This is 1879 // true if the induction variable itself is not widened, or if it has at 1880 // least one user in the loop that is not widened. 1881 auto NeedsScalarIV = needsScalarInduction(EntryVal); 1882 if (!NeedsScalarIV) { 1883 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1884 return; 1885 } 1886 1887 // Try to create a new independent vector induction variable. If we can't 1888 // create the phi node, we will splat the scalar induction variable in each 1889 // loop iteration. 1890 if (!shouldScalarizeInstruction(EntryVal)) { 1891 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1892 Value *ScalarIV = CreateScalarIV(Step); 1893 // Create scalar steps that can be used by instructions we will later 1894 // scalarize. Note that the addition of the scalar steps will not increase 1895 // the number of instructions in the loop in the common case prior to 1896 // InstCombine. We will be trading one vector extract for each scalar step. 1897 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1898 return; 1899 } 1900 1901 // If we haven't yet vectorized the induction variable, splat the scalar 1902 // induction variable, and build the necessary step vectors. 1903 // TODO: Don't do it unless the vectorized IV is really required. 1904 Value *ScalarIV = CreateScalarIV(Step); 1905 CreateSplatIV(ScalarIV, Step); 1906 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1907 } 1908 1909 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 1910 Instruction::BinaryOps BinOp) { 1911 // Create and check the types. 1912 assert(Val->getType()->isVectorTy() && "Must be a vector"); 1913 int VLen = Val->getType()->getVectorNumElements(); 1914 1915 Type *STy = Val->getType()->getScalarType(); 1916 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1917 "Induction Step must be an integer or FP"); 1918 assert(Step->getType() == STy && "Step has wrong type"); 1919 1920 SmallVector<Constant *, 8> Indices; 1921 1922 if (STy->isIntegerTy()) { 1923 // Create a vector of consecutive numbers from zero to VF. 1924 for (int i = 0; i < VLen; ++i) 1925 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 1926 1927 // Add the consecutive indices to the vector value. 1928 Constant *Cv = ConstantVector::get(Indices); 1929 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 1930 Step = Builder.CreateVectorSplat(VLen, Step); 1931 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1932 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 1933 // which can be found from the original scalar operations. 1934 Step = Builder.CreateMul(Cv, Step); 1935 return Builder.CreateAdd(Val, Step, "induction"); 1936 } 1937 1938 // Floating point induction. 1939 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1940 "Binary Opcode should be specified for FP induction"); 1941 // Create a vector of consecutive numbers from zero to VF. 1942 for (int i = 0; i < VLen; ++i) 1943 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 1944 1945 // Add the consecutive indices to the vector value. 1946 Constant *Cv = ConstantVector::get(Indices); 1947 1948 Step = Builder.CreateVectorSplat(VLen, Step); 1949 1950 // Floating point operations had to be 'fast' to enable the induction. 
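// For example (illustrative, with VF = 4, StartIdx = 0, an FP step %step and
// BinOp == FAdd), this floating-point path produces roughly:
//
//   %cv  = <float 0.0, float 1.0, float 2.0, float 3.0>
//   %sv  = splat of %step into a <4 x float> vector
//   %mul = fmul fast <4 x float> %cv, %sv
//   %ind = fadd fast <4 x float> %val, %mul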
1951 FastMathFlags Flags; 1952 Flags.setFast(); 1953 1954 Value *MulOp = Builder.CreateFMul(Cv, Step); 1955 if (isa<Instruction>(MulOp)) 1956 // Have to check, MulOp may be a constant 1957 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 1958 1959 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1960 if (isa<Instruction>(BOp)) 1961 cast<Instruction>(BOp)->setFastMathFlags(Flags); 1962 return BOp; 1963 } 1964 1965 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 1966 Instruction *EntryVal, 1967 const InductionDescriptor &ID) { 1968 // We shouldn't have to build scalar steps if we aren't vectorizing. 1969 assert(VF > 1 && "VF should be greater than one"); 1970 1971 // Get the value type and ensure it and the step have the same integer type. 1972 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 1973 assert(ScalarIVTy == Step->getType() && 1974 "Val and Step should have the same type"); 1975 1976 // We build scalar steps for both integer and floating-point induction 1977 // variables. Here, we determine the kind of arithmetic we will perform. 1978 Instruction::BinaryOps AddOp; 1979 Instruction::BinaryOps MulOp; 1980 if (ScalarIVTy->isIntegerTy()) { 1981 AddOp = Instruction::Add; 1982 MulOp = Instruction::Mul; 1983 } else { 1984 AddOp = ID.getInductionOpcode(); 1985 MulOp = Instruction::FMul; 1986 } 1987 1988 // Determine the number of scalars we need to generate for each unroll 1989 // iteration. If EntryVal is uniform, we only need to generate the first 1990 // lane. Otherwise, we generate all VF values. 1991 unsigned Lanes = 1992 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 1993 : VF; 1994 // Compute the scalar steps and save the results in VectorLoopValueMap. 1995 for (unsigned Part = 0; Part < UF; ++Part) { 1996 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 1997 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); 1998 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 1999 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2000 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2001 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2002 } 2003 } 2004 } 2005 2006 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2007 assert(V != Induction && "The new induction variable should not be used."); 2008 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2009 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2010 2011 // If we have a stride that is replaced by one, do it here. Defer this for 2012 // the VPlan-native path until we start running Legal checks in that path. 2013 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2014 V = ConstantInt::get(V->getType(), 1); 2015 2016 // If we have a vector mapped to this value, return it. 2017 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2018 return VectorLoopValueMap.getVectorValue(V, Part); 2019 2020 // If the value has not been vectorized, check if it has been scalarized 2021 // instead. If it has been scalarized, and we actually need the value in 2022 // vector form, we will construct the vector values on demand. 2023 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2024 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2025 2026 // If we've scalarized a value, that value should be an instruction. 
2027 auto *I = cast<Instruction>(V); 2028 2029 // If we aren't vectorizing, we can just copy the scalar map values over to 2030 // the vector map. 2031 if (VF == 1) { 2032 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2033 return ScalarValue; 2034 } 2035 2036 // Get the last scalar instruction we generated for V and Part. If the value 2037 // is known to be uniform after vectorization, this corresponds to lane zero 2038 // of the Part unroll iteration. Otherwise, the last instruction is the one 2039 // we created for the last vector lane of the Part unroll iteration. 2040 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; 2041 auto *LastInst = cast<Instruction>( 2042 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2043 2044 // Set the insert point after the last scalarized instruction. This ensures 2045 // the insertelement sequence will directly follow the scalar definitions. 2046 auto OldIP = Builder.saveIP(); 2047 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2048 Builder.SetInsertPoint(&*NewIP); 2049 2050 // However, if we are vectorizing, we need to construct the vector values. 2051 // If the value is known to be uniform after vectorization, we can just 2052 // broadcast the scalar value corresponding to lane zero for each unroll 2053 // iteration. Otherwise, we construct the vector values using insertelement 2054 // instructions. Since the resulting vectors are stored in 2055 // VectorLoopValueMap, we will only generate the insertelements once. 2056 Value *VectorValue = nullptr; 2057 if (Cost->isUniformAfterVectorization(I, VF)) { 2058 VectorValue = getBroadcastInstrs(ScalarValue); 2059 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2060 } else { 2061 // Initialize packing with insertelements to start from undef. 2062 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2063 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2064 for (unsigned Lane = 0; Lane < VF; ++Lane) 2065 packScalarIntoVectorValue(V, {Part, Lane}); 2066 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2067 } 2068 Builder.restoreIP(OldIP); 2069 return VectorValue; 2070 } 2071 2072 // If this scalar is unknown, assume that it is a constant or that it is 2073 // loop invariant. Broadcast V and save the value for future uses. 2074 Value *B = getBroadcastInstrs(V); 2075 VectorLoopValueMap.setVectorValue(V, Part, B); 2076 return B; 2077 } 2078 2079 Value * 2080 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2081 const VPIteration &Instance) { 2082 // If the value is not an instruction contained in the loop, it should 2083 // already be scalar. 2084 if (OrigLoop->isLoopInvariant(V)) 2085 return V; 2086 2087 assert(Instance.Lane > 0 2088 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2089 : true && "Uniform values only have lane zero"); 2090 2091 // If the value from the original loop has not been vectorized, it is 2092 // represented by UF x VF scalar values in the new loop. Return the requested 2093 // scalar value. 2094 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2095 return VectorLoopValueMap.getScalarValue(V, Instance); 2096 2097 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2098 // for the given unroll part. If this entry is not a vector type (i.e., the 2099 // vectorization factor is one), there is no need to generate an 2100 // extractelement instruction. 
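// For example (illustrative), requesting {Part = 1, Lane = 3} of a value that
// was widened to <4 x i32> per unroll part yields roughly:
//
//   %v.part1 = ...                                  ; vector value for part 1
//   %scalar  = extractelement <4 x i32> %v.part1, i32 3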
2101 auto *U = getOrCreateVectorValue(V, Instance.Part); 2102 if (!U->getType()->isVectorTy()) { 2103 assert(VF == 1 && "Value not scalarized has non-vector type"); 2104 return U; 2105 } 2106 2107 // Otherwise, the value from the original loop has been vectorized and is 2108 // represented by UF vector values. Extract and return the requested scalar 2109 // value from the appropriate vector lane. 2110 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2111 } 2112 2113 void InnerLoopVectorizer::packScalarIntoVectorValue( 2114 Value *V, const VPIteration &Instance) { 2115 assert(V != Induction && "The new induction variable should not be used."); 2116 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2117 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2118 2119 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2120 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2121 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2122 Builder.getInt32(Instance.Lane)); 2123 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2124 } 2125 2126 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2127 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2128 SmallVector<Constant *, 8> ShuffleMask; 2129 for (unsigned i = 0; i < VF; ++i) 2130 ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); 2131 2132 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2133 ConstantVector::get(ShuffleMask), 2134 "reverse"); 2135 } 2136 2137 // Return whether we allow using masked interleave-groups (for dealing with 2138 // strided loads/stores that reside in predicated blocks, or for dealing 2139 // with gaps). 2140 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2141 // If an override option has been passed in for interleaved accesses, use it. 2142 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2143 return EnableMaskedInterleavedMemAccesses; 2144 2145 return TTI.enableMaskedInterleavedAccessVectorization(); 2146 } 2147 2148 // Try to vectorize the interleave group that \p Instr belongs to. 2149 // 2150 // E.g. Translate following interleaved load group (factor = 3): 2151 // for (i = 0; i < N; i+=3) { 2152 // R = Pic[i]; // Member of index 0 2153 // G = Pic[i+1]; // Member of index 1 2154 // B = Pic[i+2]; // Member of index 2 2155 // ... // do something to R, G, B 2156 // } 2157 // To: 2158 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2159 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2160 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2161 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2162 // 2163 // Or translate following interleaved store group (factor = 3): 2164 // for (i = 0; i < N; i+=3) { 2165 // ... 
do something to R, G, B 2166 // Pic[i] = R; // Member of index 0 2167 // Pic[i+1] = G; // Member of index 1 2168 // Pic[i+2] = B; // Member of index 2 2169 // } 2170 // To: 2171 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2172 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2173 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2174 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2175 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2176 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, 2177 VPTransformState &State, 2178 VPValue *Addr, 2179 VPValue *BlockInMask) { 2180 const InterleaveGroup<Instruction> *Group = 2181 Cost->getInterleavedAccessGroup(Instr); 2182 assert(Group && "Fail to get an interleaved access group."); 2183 2184 // Skip if current instruction is not the insert position. 2185 if (Instr != Group->getInsertPos()) 2186 return; 2187 2188 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2189 2190 // Prepare for the vector type of the interleaved load/store. 2191 Type *ScalarTy = getMemInstValueType(Instr); 2192 unsigned InterleaveFactor = Group->getFactor(); 2193 Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); 2194 2195 // Prepare for the new pointers. 2196 SmallVector<Value *, 2> AddrParts; 2197 unsigned Index = Group->getIndex(Instr); 2198 2199 // TODO: extend the masked interleaved-group support to reversed access. 2200 assert((!BlockInMask || !Group->isReverse()) && 2201 "Reversed masked interleave-group not supported."); 2202 2203 // If the group is reverse, adjust the index to refer to the last vector lane 2204 // instead of the first. We adjust the index from the first vector lane, 2205 // rather than directly getting the pointer for lane VF - 1, because the 2206 // pointer operand of the interleaved access is supposed to be uniform. For 2207 // uniform instructions, we're only required to generate a value for the 2208 // first vector lane in each unroll iteration. 2209 if (Group->isReverse()) 2210 Index += (VF - 1) * Group->getFactor(); 2211 2212 for (unsigned Part = 0; Part < UF; Part++) { 2213 Value *AddrPart = State.get(Addr, {Part, 0}); 2214 setDebugLocFromInst(Builder, AddrPart); 2215 2216 // Notice current instruction could be any index. Need to adjust the address 2217 // to the member of index 0. 2218 // 2219 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2220 // b = A[i]; // Member of index 0 2221 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2222 // 2223 // E.g. A[i+1] = a; // Member of index 1 2224 // A[i] = b; // Member of index 0 2225 // A[i+2] = c; // Member of index 2 (Current instruction) 2226 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2227 2228 bool InBounds = false; 2229 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2230 InBounds = gep->isInBounds(); 2231 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2232 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2233 2234 // Cast to the vector pointer type. 
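// E.g. (illustrative), for a factor-3 interleave group of i32 accesses with
// VF = 4, where the current instruction is the member of index 2, the address
// is first moved back two elements and then cast to the wide vector pointer:
//
//   %adj  = getelementptr i32, i32* %addr.part, i32 -2
//   %wptr = bitcast i32* %adj to <12 x i32>*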
2235 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2236 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2237 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2238 } 2239 2240 setDebugLocFromInst(Builder, Instr); 2241 Value *UndefVec = UndefValue::get(VecTy); 2242 2243 Value *MaskForGaps = nullptr; 2244 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2245 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); 2246 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2247 } 2248 2249 // Vectorize the interleaved load group. 2250 if (isa<LoadInst>(Instr)) { 2251 // For each unroll part, create a wide load for the group. 2252 SmallVector<Value *, 2> NewLoads; 2253 for (unsigned Part = 0; Part < UF; Part++) { 2254 Instruction *NewLoad; 2255 if (BlockInMask || MaskForGaps) { 2256 assert(useMaskedInterleavedAccesses(*TTI) && 2257 "masked interleaved groups are not allowed."); 2258 Value *GroupMask = MaskForGaps; 2259 if (BlockInMask) { 2260 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2261 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2262 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2263 Value *ShuffledMask = Builder.CreateShuffleVector( 2264 BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); 2265 GroupMask = MaskForGaps 2266 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2267 MaskForGaps) 2268 : ShuffledMask; 2269 } 2270 NewLoad = 2271 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2272 GroupMask, UndefVec, "wide.masked.vec"); 2273 } 2274 else 2275 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2276 Group->getAlign(), "wide.vec"); 2277 Group->addMetadata(NewLoad); 2278 NewLoads.push_back(NewLoad); 2279 } 2280 2281 // For each member in the group, shuffle out the appropriate data from the 2282 // wide loads. 2283 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2284 Instruction *Member = Group->getMember(I); 2285 2286 // Skip the gaps in the group. 2287 if (!Member) 2288 continue; 2289 2290 Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF); 2291 for (unsigned Part = 0; Part < UF; Part++) { 2292 Value *StridedVec = Builder.CreateShuffleVector( 2293 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2294 2295 // If this member has different type, cast the result type. 2296 if (Member->getType() != ScalarTy) { 2297 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2298 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2299 } 2300 2301 if (Group->isReverse()) 2302 StridedVec = reverseVector(StridedVec); 2303 2304 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2305 } 2306 } 2307 return; 2308 } 2309 2310 // The sub vector type for current instruction. 2311 VectorType *SubVT = VectorType::get(ScalarTy, VF); 2312 2313 // Vectorize the interleaved store group. 2314 for (unsigned Part = 0; Part < UF; Part++) { 2315 // Collect the stored vector from each member. 
2316 SmallVector<Value *, 4> StoredVecs; 2317 for (unsigned i = 0; i < InterleaveFactor; i++) { 2318 // Interleaved store group doesn't allow a gap, so each index has a member 2319 Instruction *Member = Group->getMember(i); 2320 assert(Member && "Fail to get a member from an interleaved store group"); 2321 2322 Value *StoredVec = getOrCreateVectorValue( 2323 cast<StoreInst>(Member)->getValueOperand(), Part); 2324 if (Group->isReverse()) 2325 StoredVec = reverseVector(StoredVec); 2326 2327 // If this member has different type, cast it to a unified type. 2328 2329 if (StoredVec->getType() != SubVT) 2330 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2331 2332 StoredVecs.push_back(StoredVec); 2333 } 2334 2335 // Concatenate all vectors into a wide vector. 2336 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2337 2338 // Interleave the elements in the wide vector. 2339 Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor); 2340 Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, 2341 "interleaved.vec"); 2342 2343 Instruction *NewStoreInstr; 2344 if (BlockInMask) { 2345 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2346 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2347 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2348 Value *ShuffledMask = Builder.CreateShuffleVector( 2349 BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); 2350 NewStoreInstr = Builder.CreateMaskedStore( 2351 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2352 } 2353 else 2354 NewStoreInstr = 2355 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2356 2357 Group->addMetadata(NewStoreInstr); 2358 } 2359 } 2360 2361 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2362 VPTransformState &State, 2363 VPValue *Addr, 2364 VPValue *StoredValue, 2365 VPValue *BlockInMask) { 2366 // Attempt to issue a wide load. 2367 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2368 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2369 2370 assert((LI || SI) && "Invalid Load/Store instruction"); 2371 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2372 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2373 2374 LoopVectorizationCostModel::InstWidening Decision = 2375 Cost->getWideningDecision(Instr, VF); 2376 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 2377 "CM decision should be taken at this point"); 2378 if (Decision == LoopVectorizationCostModel::CM_Interleave) 2379 return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask); 2380 2381 Type *ScalarDataTy = getMemInstValueType(Instr); 2382 Type *DataTy = VectorType::get(ScalarDataTy, VF); 2383 // An alignment of 0 means target abi alignment. We need to use the scalar's 2384 // target abi alignment in such a case. 2385 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2386 const Align Alignment = 2387 DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy); 2388 2389 // Determine if the pointer operand of the access is either consecutive or 2390 // reverse consecutive. 2391 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2392 bool ConsecutiveStride = 2393 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2394 bool CreateGatherScatter = 2395 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2396 2397 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2398 // gather/scatter. 
Otherwise Decision should have been to Scalarize. 2399 assert((ConsecutiveStride || CreateGatherScatter) && 2400 "The instruction should be scalarized"); 2401 (void)ConsecutiveStride; 2402 2403 VectorParts BlockInMaskParts(UF); 2404 bool isMaskRequired = BlockInMask; 2405 if (isMaskRequired) 2406 for (unsigned Part = 0; Part < UF; ++Part) 2407 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2408 2409 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2410 // Calculate the pointer for the specific unroll-part. 2411 GetElementPtrInst *PartPtr = nullptr; 2412 2413 bool InBounds = false; 2414 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2415 InBounds = gep->isInBounds(); 2416 2417 if (Reverse) { 2418 // If the address is consecutive but reversed, then the 2419 // wide store needs to start at the last vector element. 2420 PartPtr = cast<GetElementPtrInst>( 2421 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); 2422 PartPtr->setIsInBounds(InBounds); 2423 PartPtr = cast<GetElementPtrInst>( 2424 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); 2425 PartPtr->setIsInBounds(InBounds); 2426 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2427 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2428 } else { 2429 PartPtr = cast<GetElementPtrInst>( 2430 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); 2431 PartPtr->setIsInBounds(InBounds); 2432 } 2433 2434 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2435 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2436 }; 2437 2438 // Handle Stores: 2439 if (SI) { 2440 setDebugLocFromInst(Builder, SI); 2441 2442 for (unsigned Part = 0; Part < UF; ++Part) { 2443 Instruction *NewSI = nullptr; 2444 Value *StoredVal = State.get(StoredValue, Part); 2445 if (CreateGatherScatter) { 2446 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2447 Value *VectorGep = State.get(Addr, Part); 2448 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2449 MaskPart); 2450 } else { 2451 if (Reverse) { 2452 // If we store to reverse consecutive memory locations, then we need 2453 // to reverse the order of elements in the stored value. 2454 StoredVal = reverseVector(StoredVal); 2455 // We don't want to update the value in the map as it might be used in 2456 // another expression. So don't call resetVectorValue(StoredVal). 2457 } 2458 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2459 if (isMaskRequired) 2460 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2461 BlockInMaskParts[Part]); 2462 else 2463 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2464 } 2465 addMetadata(NewSI, SI); 2466 } 2467 return; 2468 } 2469 2470 // Handle loads. 2471 assert(LI && "Must have a load instruction"); 2472 setDebugLocFromInst(Builder, LI); 2473 for (unsigned Part = 0; Part < UF; ++Part) { 2474 Value *NewLI; 2475 if (CreateGatherScatter) { 2476 Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; 2477 Value *VectorGep = State.get(Addr, Part); 2478 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2479 nullptr, "wide.masked.gather"); 2480 addMetadata(NewLI, LI); 2481 } else { 2482 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2483 if (isMaskRequired) 2484 NewLI = Builder.CreateMaskedLoad( 2485 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2486 "wide.masked.load"); 2487 else 2488 NewLI = 2489 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2490 2491 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2492 addMetadata(NewLI, LI); 2493 if (Reverse) 2494 NewLI = reverseVector(NewLI); 2495 } 2496 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2497 } 2498 } 2499 2500 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2501 const VPIteration &Instance, 2502 bool IfPredicateInstr) { 2503 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2504 2505 setDebugLocFromInst(Builder, Instr); 2506 2507 // Does this instruction return a value ? 2508 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2509 2510 Instruction *Cloned = Instr->clone(); 2511 if (!IsVoidRetTy) 2512 Cloned->setName(Instr->getName() + ".cloned"); 2513 2514 // Replace the operands of the cloned instructions with their scalar 2515 // equivalents in the new loop. 2516 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 2517 auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance); 2518 Cloned->setOperand(op, NewOp); 2519 } 2520 addNewMetadata(Cloned, Instr); 2521 2522 // Place the cloned scalar in the new loop. 2523 Builder.Insert(Cloned); 2524 2525 // Add the cloned scalar to the scalar map entry. 2526 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2527 2528 // If we just cloned a new assumption, add it the assumption cache. 2529 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2530 if (II->getIntrinsicID() == Intrinsic::assume) 2531 AC->registerAssumption(II); 2532 2533 // End if-block. 2534 if (IfPredicateInstr) 2535 PredicatedInstructions.push_back(Cloned); 2536 } 2537 2538 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2539 Value *End, Value *Step, 2540 Instruction *DL) { 2541 BasicBlock *Header = L->getHeader(); 2542 BasicBlock *Latch = L->getLoopLatch(); 2543 // As we're just creating this loop, it's possible no latch exists 2544 // yet. If so, use the header as this will be a single block loop. 2545 if (!Latch) 2546 Latch = Header; 2547 2548 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2549 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2550 setDebugLocFromInst(Builder, OldInst); 2551 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2552 2553 Builder.SetInsertPoint(Latch->getTerminator()); 2554 setDebugLocFromInst(Builder, OldInst); 2555 2556 // Create i+1 and fill the PHINode. 2557 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2558 Induction->addIncoming(Start, L->getLoopPreheader()); 2559 Induction->addIncoming(Next, Latch); 2560 // Create the compare. 2561 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2562 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2563 2564 // Now we have two terminators. Remove the old one from the block. 
2565 Latch->getTerminator()->eraseFromParent(); 2566 2567 return Induction; 2568 } 2569 2570 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2571 if (TripCount) 2572 return TripCount; 2573 2574 assert(L && "Create Trip Count for null loop."); 2575 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2576 // Find the loop boundaries. 2577 ScalarEvolution *SE = PSE.getSE(); 2578 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2579 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2580 "Invalid loop count"); 2581 2582 Type *IdxTy = Legal->getWidestInductionType(); 2583 assert(IdxTy && "No type for induction"); 2584 2585 // The exit count might have the type of i64 while the phi is i32. This can 2586 // happen if we have an induction variable that is sign extended before the 2587 // compare. The only way that we get a backedge taken count is that the 2588 // induction variable was signed and as such will not overflow. In such a case 2589 // truncation is legal. 2590 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2591 IdxTy->getPrimitiveSizeInBits()) 2592 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2593 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2594 2595 // Get the total trip count from the count by adding 1. 2596 const SCEV *ExitCount = SE->getAddExpr( 2597 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2598 2599 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2600 2601 // Expand the trip count and place the new instructions in the preheader. 2602 // Notice that the pre-header does not change, only the loop body. 2603 SCEVExpander Exp(*SE, DL, "induction"); 2604 2605 // Count holds the overall loop count (N). 2606 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2607 L->getLoopPreheader()->getTerminator()); 2608 2609 if (TripCount->getType()->isPointerTy()) 2610 TripCount = 2611 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2612 L->getLoopPreheader()->getTerminator()); 2613 2614 return TripCount; 2615 } 2616 2617 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2618 if (VectorTripCount) 2619 return VectorTripCount; 2620 2621 Value *TC = getOrCreateTripCount(L); 2622 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2623 2624 Type *Ty = TC->getType(); 2625 Constant *Step = ConstantInt::get(Ty, VF * UF); 2626 2627 // If the tail is to be folded by masking, round the number of iterations N 2628 // up to a multiple of Step instead of rounding down. This is done by first 2629 // adding Step-1 and then rounding down. Note that it's ok if this addition 2630 // overflows: the vector induction variable will eventually wrap to zero given 2631 // that it starts at zero and its Step is a power of two; the loop will then 2632 // exit, with the last early-exit vector comparison also producing all-true. 2633 if (Cost->foldTailByMasking()) { 2634 assert(isPowerOf2_32(VF * UF) && 2635 "VF*UF must be a power of 2 when folding tail by masking"); 2636 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); 2637 } 2638 2639 // Now we need to generate the expression for the part of the loop that the 2640 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2641 // iterations are not required for correctness, or N - Step, otherwise. 
Step 2642 // is equal to the vectorization factor (number of SIMD elements) times the 2643 // unroll factor (number of SIMD instructions). 2644 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2645 2646 // If there is a non-reversed interleaved group that may speculatively access 2647 // memory out-of-bounds, we need to ensure that there will be at least one 2648 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2649 // the trip count, we set the remainder to be equal to the step. If the step 2650 // does not evenly divide the trip count, no adjustment is necessary since 2651 // there will already be scalar iterations. Note that the minimum iterations 2652 // check ensures that N >= Step. 2653 if (VF > 1 && Cost->requiresScalarEpilogue()) { 2654 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2655 R = Builder.CreateSelect(IsZero, Step, R); 2656 } 2657 2658 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2659 2660 return VectorTripCount; 2661 } 2662 2663 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2664 const DataLayout &DL) { 2665 // Verify that V is a vector type with same number of elements as DstVTy. 2666 unsigned VF = DstVTy->getNumElements(); 2667 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2668 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2669 Type *SrcElemTy = SrcVecTy->getElementType(); 2670 Type *DstElemTy = DstVTy->getElementType(); 2671 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2672 "Vector elements must have same size"); 2673 2674 // Do a direct cast if element types are castable. 2675 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2676 return Builder.CreateBitOrPointerCast(V, DstVTy); 2677 } 2678 // V cannot be directly casted to desired vector type. 2679 // May happen when V is a floating point vector but DstVTy is a vector of 2680 // pointers or vice-versa. Handle this using a two-step bitcast using an 2681 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2682 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2683 "Only one type should be a pointer type"); 2684 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2685 "Only one type should be a floating point type"); 2686 Type *IntTy = 2687 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2688 VectorType *VecIntTy = VectorType::get(IntTy, VF); 2689 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2690 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2691 } 2692 2693 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2694 BasicBlock *Bypass) { 2695 Value *Count = getOrCreateTripCount(L); 2696 // Reuse existing vector loop preheader for TC checks. 2697 // Note that new preheader block is generated for vector loop. 2698 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2699 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2700 2701 // Generate code to check if the loop's trip count is less than VF * UF, or 2702 // equal to it in case a scalar epilogue is required; this implies that the 2703 // vector trip count is zero. This check also covers the case where adding one 2704 // to the backedge-taken count overflowed leading to an incorrect trip count 2705 // of zero. In this case we will also jump to the scalar loop. 2706 auto P = Cost->requiresScalarEpilogue() ? 
ICmpInst::ICMP_ULE 2707 : ICmpInst::ICMP_ULT; 2708 2709 // If tail is to be folded, vector loop takes care of all iterations. 2710 Value *CheckMinIters = Builder.getFalse(); 2711 if (!Cost->foldTailByMasking()) 2712 CheckMinIters = Builder.CreateICmp( 2713 P, Count, ConstantInt::get(Count->getType(), VF * UF), 2714 "min.iters.check"); 2715 2716 // Create new preheader for vector loop. 2717 LoopVectorPreHeader = 2718 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2719 "vector.ph"); 2720 2721 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2722 DT->getNode(Bypass)->getIDom()) && 2723 "TC check is expected to dominate Bypass"); 2724 2725 // Update dominator for Bypass & LoopExit. 2726 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2727 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2728 2729 ReplaceInstWithInst( 2730 TCCheckBlock->getTerminator(), 2731 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2732 LoopBypassBlocks.push_back(TCCheckBlock); 2733 } 2734 2735 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2736 // Reuse existing vector loop preheader for SCEV checks. 2737 // Note that new preheader block is generated for vector loop. 2738 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 2739 2740 // Generate the code to check that the SCEV assumptions that we made. 2741 // We want the new basic block to start at the first instruction in a 2742 // sequence of instructions that form a check. 2743 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2744 "scev.check"); 2745 Value *SCEVCheck = Exp.expandCodeForPredicate( 2746 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 2747 2748 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2749 if (C->isZero()) 2750 return; 2751 2752 assert(!SCEVCheckBlock->getParent()->hasOptSize() && 2753 "Cannot SCEV check stride or overflow when optimizing for size"); 2754 2755 SCEVCheckBlock->setName("vector.scevcheck"); 2756 // Create new preheader for vector loop. 2757 LoopVectorPreHeader = 2758 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 2759 nullptr, "vector.ph"); 2760 2761 // Update dominator only if this is first RT check. 2762 if (LoopBypassBlocks.empty()) { 2763 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2764 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2765 } 2766 2767 ReplaceInstWithInst( 2768 SCEVCheckBlock->getTerminator(), 2769 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 2770 LoopBypassBlocks.push_back(SCEVCheckBlock); 2771 AddedSafetyChecks = true; 2772 } 2773 2774 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2775 // VPlan-native path does not do any analysis for runtime checks currently. 2776 if (EnableVPlanNativePath) 2777 return; 2778 2779 // Reuse existing vector loop preheader for runtime memory checks. 2780 // Note that new preheader block is generated for vector loop. 2781 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 2782 2783 // Generate the code that checks in runtime if arrays overlap. We put the 2784 // checks into a separate block to make the more common case of few elements 2785 // faster. 
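// In shorthand (illustrative only; the actual checks are produced by
// LoopAccessInfo::addRuntimeChecks below), a check for two pointer groups A
// and B covering [A.Start, A.End) and [B.Start, B.End) is of the form
//   conflict = (A.Start < B.End) && (B.Start < A.End)
// and a true result branches to Bypass, i.e. to the scalar loop.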
2786 Instruction *FirstCheckInst; 2787 Instruction *MemRuntimeCheck; 2788 std::tie(FirstCheckInst, MemRuntimeCheck) = 2789 Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator()); 2790 if (!MemRuntimeCheck) 2791 return; 2792 2793 if (MemCheckBlock->getParent()->hasOptSize()) { 2794 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2795 "Cannot emit memory checks when optimizing for size, unless forced " 2796 "to vectorize."); 2797 ORE->emit([&]() { 2798 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2799 L->getStartLoc(), L->getHeader()) 2800 << "Code-size may be reduced by not forcing " 2801 "vectorization, or by source-code modifications " 2802 "eliminating the need for runtime checks " 2803 "(e.g., adding 'restrict')."; 2804 }); 2805 } 2806 2807 MemCheckBlock->setName("vector.memcheck"); 2808 // Create new preheader for vector loop. 2809 LoopVectorPreHeader = 2810 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2811 "vector.ph"); 2812 2813 // Update dominator only if this is first RT check. 2814 if (LoopBypassBlocks.empty()) { 2815 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2816 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2817 } 2818 2819 ReplaceInstWithInst( 2820 MemCheckBlock->getTerminator(), 2821 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); 2822 LoopBypassBlocks.push_back(MemCheckBlock); 2823 AddedSafetyChecks = true; 2824 2825 // We currently don't use LoopVersioning for the actual loop cloning but we 2826 // still use it to add the noalias metadata. 2827 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2828 PSE.getSE()); 2829 LVer->prepareNoAliasMetadata(); 2830 } 2831 2832 Value *InnerLoopVectorizer::emitTransformedIndex( 2833 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2834 const InductionDescriptor &ID) const { 2835 2836 SCEVExpander Exp(*SE, DL, "induction"); 2837 auto Step = ID.getStep(); 2838 auto StartValue = ID.getStartValue(); 2839 assert(Index->getType() == Step->getType() && 2840 "Index type does not match StepValue type"); 2841 2842 // Note: the IR at this point is broken. We cannot use SE to create any new 2843 // SCEV and then expand it, hoping that SCEV's simplification will give us 2844 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2845 // lead to various SCEV crashes. So all we can do is to use builder and rely 2846 // on InstCombine for future simplifications. Here we handle some trivial 2847 // cases only. 
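// Accordingly, the helpers below fold only the trivial identities directly,
// e.g. 0 + X --> X and 1 * X --> X, and otherwise emit the plain add/mul for
// InstCombine to simplify later.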
2848 auto CreateAdd = [&B](Value *X, Value *Y) { 2849 assert(X->getType() == Y->getType() && "Types don't match!"); 2850 if (auto *CX = dyn_cast<ConstantInt>(X)) 2851 if (CX->isZero()) 2852 return Y; 2853 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2854 if (CY->isZero()) 2855 return X; 2856 return B.CreateAdd(X, Y); 2857 }; 2858 2859 auto CreateMul = [&B](Value *X, Value *Y) { 2860 assert(X->getType() == Y->getType() && "Types don't match!"); 2861 if (auto *CX = dyn_cast<ConstantInt>(X)) 2862 if (CX->isOne()) 2863 return Y; 2864 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2865 if (CY->isOne()) 2866 return X; 2867 return B.CreateMul(X, Y); 2868 }; 2869 2870 switch (ID.getKind()) { 2871 case InductionDescriptor::IK_IntInduction: { 2872 assert(Index->getType() == StartValue->getType() && 2873 "Index type does not match StartValue type"); 2874 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 2875 return B.CreateSub(StartValue, Index); 2876 auto *Offset = CreateMul( 2877 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); 2878 return CreateAdd(StartValue, Offset); 2879 } 2880 case InductionDescriptor::IK_PtrInduction: { 2881 assert(isa<SCEVConstant>(Step) && 2882 "Expected constant step for pointer induction"); 2883 return B.CreateGEP( 2884 StartValue->getType()->getPointerElementType(), StartValue, 2885 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), 2886 &*B.GetInsertPoint()))); 2887 } 2888 case InductionDescriptor::IK_FpInduction: { 2889 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2890 auto InductionBinOp = ID.getInductionBinOp(); 2891 assert(InductionBinOp && 2892 (InductionBinOp->getOpcode() == Instruction::FAdd || 2893 InductionBinOp->getOpcode() == Instruction::FSub) && 2894 "Original bin op should be defined for FP induction"); 2895 2896 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 2897 2898 // Floating point operations had to be 'fast' to enable the induction. 2899 FastMathFlags Flags; 2900 Flags.setFast(); 2901 2902 Value *MulExp = B.CreateFMul(StepValue, Index); 2903 if (isa<Instruction>(MulExp)) 2904 // We have to check, the MulExp may be a constant. 2905 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 2906 2907 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2908 "induction"); 2909 if (isa<Instruction>(BOp)) 2910 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2911 2912 return BOp; 2913 } 2914 case InductionDescriptor::IK_NoInduction: 2915 return nullptr; 2916 } 2917 llvm_unreachable("invalid enum"); 2918 } 2919 2920 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 2921 /* 2922 In this function we generate a new loop. The new loop will contain 2923 the vectorized instructions while the old loop will continue to run the 2924 scalar remainder. 2925 2926 [ ] <-- loop iteration number check. 2927 / | 2928 / v 2929 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2930 | / | 2931 | / v 2932 || [ ] <-- vector pre header. 2933 |/ | 2934 | v 2935 | [ ] \ 2936 | [ ]_| <-- vector loop. 2937 | | 2938 | v 2939 | -[ ] <--- middle-block. 2940 | / | 2941 | / v 2942 -|- >[ ] <--- new preheader. 2943 | | 2944 | v 2945 | [ ] \ 2946 | [ ]_| <-- old scalar loop to handle remainder. 2947 \ | 2948 \ v 2949 >[ ] <-- exit block. 2950 ... 2951 */ 2952 2953 MDNode *OrigLoopID = OrigLoop->getLoopID(); 2954 2955 // Some loops have a single integer induction variable, while other loops 2956 // don't. 
One example is C++ iterators that often have multiple pointer
2957 // induction variables. In the code below we also support a case where we
2958 // don't have a single induction variable.
2959 //
2960 // We try hard to obtain an induction variable from the original loop.
2961 // However, if we don't find one that:
2962 // - is an integer
2963 // - counts from zero, stepping by one
2964 // - is the size of the widest induction variable type
2965 // then we create a new one.
2966 OldInduction = Legal->getPrimaryInduction();
2967 Type *IdxTy = Legal->getWidestInductionType();
2968
2969 // Split the single block loop into the two loop structure described above.
2970 LoopScalarBody = OrigLoop->getHeader();
2971 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2972 LoopExitBlock = OrigLoop->getExitBlock();
2973 assert(LoopExitBlock && "Must have an exit block");
2974 assert(LoopVectorPreHeader && "Invalid loop structure");
2975
2976 LoopMiddleBlock =
2977 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2978 LI, nullptr, "middle.block");
2979 LoopScalarPreHeader =
2980 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2981 nullptr, "scalar.ph");
2982 // We intentionally don't let SplitBlock update LoopInfo since
2983 // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
2984 // LoopVectorBody is explicitly added to the correct place a few lines later.
2985 LoopVectorBody =
2986 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2987 nullptr, nullptr, "vector.body");
2988
2989 // Update dominator for loop exit.
2990 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
2991
2992 // Create and register the new vector loop.
2993 Loop *Lp = LI->AllocateLoop();
2994 Loop *ParentLoop = OrigLoop->getParentLoop();
2995
2996 // Insert the new loop into the loop nest and register the new basic blocks
2997 // before calling any utilities such as SCEV that require valid LoopInfo.
2998 if (ParentLoop) {
2999 ParentLoop->addChildLoop(Lp);
3000 } else {
3001 LI->addTopLevelLoop(Lp);
3002 }
3003 Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3004
3005 // Find the loop boundaries.
3006 Value *Count = getOrCreateTripCount(Lp);
3007
3008 Value *StartIdx = ConstantInt::get(IdxTy, 0);
3009
3010 // Now, compare the new count to zero. If it is zero, skip the vector loop
3011 // and jump to the scalar loop. This check also covers the case where the
3012 // backedge-taken count is uint##_max: adding one to it will overflow leading
3013 // to an incorrect trip count of zero. In this (rare) case we will also jump
3014 // to the scalar loop.
3015 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3016
3017 // Generate the code to check any assumptions that we've made for SCEV
3018 // expressions.
3019 emitSCEVChecks(Lp, LoopScalarPreHeader);
3020
3021 // Generate the code that checks at runtime whether the arrays overlap. We
3022 // put the checks into a separate block to make the more common case of few
3023 // elements faster.
3024 emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3025
3026 // Generate the induction variable.
3027 // The loop step is equal to the vectorization factor (num of SIMD elements)
3028 // times the unroll factor (num of SIMD instructions).
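// For example (illustrative), with VF = 4 and UF = 2 the induction created
// below starts at 0 and advances by 8 scalar iterations per vector-loop
// iteration.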
3029 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3030 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 3031 Induction = 3032 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3033 getDebugLocFromInstOrOperands(OldInduction)); 3034 3035 // We are going to resume the execution of the scalar loop. 3036 // Go over all of the induction variables that we found and fix the 3037 // PHIs that are left in the scalar version of the loop. 3038 // The starting values of PHI nodes depend on the counter of the last 3039 // iteration in the vectorized loop. 3040 // If we come from a bypass edge then we need to start from the original 3041 // start value. 3042 3043 // This variable saves the new starting index for the scalar loop. It is used 3044 // to test if there are any tail iterations left once the vector loop has 3045 // completed. 3046 for (auto &InductionEntry : Legal->getInductionVars()) { 3047 PHINode *OrigPhi = InductionEntry.first; 3048 InductionDescriptor II = InductionEntry.second; 3049 3050 // Create phi nodes to merge from the backedge-taken check block. 3051 PHINode *BCResumeVal = 3052 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3053 LoopScalarPreHeader->getTerminator()); 3054 // Copy original phi DL over to the new one. 3055 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3056 Value *&EndValue = IVEndValues[OrigPhi]; 3057 if (OrigPhi == OldInduction) { 3058 // We know what the end value is. 3059 EndValue = CountRoundDown; 3060 } else { 3061 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); 3062 Type *StepType = II.getStep()->getType(); 3063 Instruction::CastOps CastOp = 3064 CastInst::getCastOpcode(CountRoundDown, true, StepType, true); 3065 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); 3066 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3067 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3068 EndValue->setName("ind.end"); 3069 } 3070 3071 // The new PHI merges the original incoming value, in case of a bypass, 3072 // or the value at the end of the vectorized loop. 3073 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3074 3075 // Fix the scalar body counter (PHI node). 3076 // The old induction's phi node in the scalar body needs the truncated 3077 // value. 3078 for (BasicBlock *BB : LoopBypassBlocks) 3079 BCResumeVal->addIncoming(II.getStartValue(), BB); 3080 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3081 } 3082 3083 // We need the OrigLoop (scalar loop part) latch terminator to help 3084 // produce correct debug info for the middle block BB instructions. 3085 // The legality check stage guarantees that the loop will have a single 3086 // latch. 3087 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3088 "Scalar loop latch terminator isn't a branch"); 3089 BranchInst *ScalarLatchBr = 3090 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3091 3092 // Add a check in the middle block to see if we have completed 3093 // all of the iterations in the first vector loop. 3094 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3095 // If tail is to be folded, we know we don't need to run the remainder. 
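// For example (illustrative), with VF * UF = 4: a trip count N = 20 gives
// CountRoundDown = 20, the compare is true and we branch straight to the exit
// block, while N = 22 gives CountRoundDown = 20, the compare is false and the
// scalar loop runs the remaining 2 iterations.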
3096 Value *CmpN = Builder.getTrue(); 3097 if (!Cost->foldTailByMasking()) { 3098 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3099 CountRoundDown, "cmp.n", 3100 LoopMiddleBlock->getTerminator()); 3101 3102 // Here we use the same DebugLoc as the scalar loop latch branch instead 3103 // of the corresponding compare because they may have ended up with 3104 // different line numbers and we want to avoid awkward line stepping while 3105 // debugging. Eg. if the compare has got a line number inside the loop. 3106 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3107 } 3108 3109 BranchInst *BrInst = 3110 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3111 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3112 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3113 3114 // Get ready to start creating new instructions into the vectorized body. 3115 assert(LoopVectorPreHeader == Lp->getLoopPreheader() && 3116 "Inconsistent vector loop preheader"); 3117 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3118 3119 Optional<MDNode *> VectorizedLoopID = 3120 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3121 LLVMLoopVectorizeFollowupVectorized}); 3122 if (VectorizedLoopID.hasValue()) { 3123 Lp->setLoopID(VectorizedLoopID.getValue()); 3124 3125 // Do not setAlreadyVectorized if loop attributes have been defined 3126 // explicitly. 3127 return LoopVectorPreHeader; 3128 } 3129 3130 // Keep all loop hints from the original loop on the vector loop (we'll 3131 // replace the vectorizer-specific hints below). 3132 if (MDNode *LID = OrigLoop->getLoopID()) 3133 Lp->setLoopID(LID); 3134 3135 LoopVectorizeHints Hints(Lp, true, *ORE); 3136 Hints.setAlreadyVectorized(); 3137 3138 #ifdef EXPENSIVE_CHECKS 3139 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3140 LI->verify(*DT); 3141 #endif 3142 3143 return LoopVectorPreHeader; 3144 } 3145 3146 // Fix up external users of the induction variable. At this point, we are 3147 // in LCSSA form, with all external PHIs that use the IV having one input value, 3148 // coming from the remainder loop. We need those PHIs to also have a correct 3149 // value for the IV when arriving directly from the middle block. 3150 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3151 const InductionDescriptor &II, 3152 Value *CountRoundDown, Value *EndValue, 3153 BasicBlock *MiddleBlock) { 3154 // There are two kinds of external IV usages - those that use the value 3155 // computed in the last iteration (the PHI) and those that use the penultimate 3156 // value (the value that feeds into the phi from the loop latch). 3157 // We allow both, but they, obviously, have different values. 3158 3159 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3160 3161 DenseMap<Value *, Value *> MissingVals; 3162 3163 // An external user of the last iteration's value should see the value that 3164 // the remainder loop uses to initialize its own IV. 3165 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3166 for (User *U : PostInc->users()) { 3167 Instruction *UI = cast<Instruction>(U); 3168 if (!OrigLoop->contains(UI)) { 3169 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3170 MissingVals[UI] = EndValue; 3171 } 3172 } 3173 3174 // An external user of the penultimate value need to see EndValue - Step. 3175 // The simplest way to get this is to recompute it from the constituent SCEVs, 3176 // that is Start + (Step * (CRD - 1)). 
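// For example (illustrative), an IV with Start = 0 and Step = 1 and a vector
// trip count CRD = 20 gives such users the value 0 + 1 * (20 - 1) = 19, the
// value the IV held in the last iteration executed by the vector loop.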
3177 for (User *U : OrigPhi->users()) { 3178 auto *UI = cast<Instruction>(U); 3179 if (!OrigLoop->contains(UI)) { 3180 const DataLayout &DL = 3181 OrigLoop->getHeader()->getModule()->getDataLayout(); 3182 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3183 3184 IRBuilder<> B(MiddleBlock->getTerminator()); 3185 Value *CountMinusOne = B.CreateSub( 3186 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3187 Value *CMO = 3188 !II.getStep()->getType()->isIntegerTy() 3189 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3190 II.getStep()->getType()) 3191 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3192 CMO->setName("cast.cmo"); 3193 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3194 Escape->setName("ind.escape"); 3195 MissingVals[UI] = Escape; 3196 } 3197 } 3198 3199 for (auto &I : MissingVals) { 3200 PHINode *PHI = cast<PHINode>(I.first); 3201 // One corner case we have to handle is two IVs "chasing" each-other, 3202 // that is %IV2 = phi [...], [ %IV1, %latch ] 3203 // In this case, if IV1 has an external use, we need to avoid adding both 3204 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3205 // don't already have an incoming value for the middle block. 3206 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3207 PHI->addIncoming(I.second, MiddleBlock); 3208 } 3209 } 3210 3211 namespace { 3212 3213 struct CSEDenseMapInfo { 3214 static bool canHandle(const Instruction *I) { 3215 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3216 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3217 } 3218 3219 static inline Instruction *getEmptyKey() { 3220 return DenseMapInfo<Instruction *>::getEmptyKey(); 3221 } 3222 3223 static inline Instruction *getTombstoneKey() { 3224 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3225 } 3226 3227 static unsigned getHashValue(const Instruction *I) { 3228 assert(canHandle(I) && "Unknown instruction!"); 3229 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3230 I->value_op_end())); 3231 } 3232 3233 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3234 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3235 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3236 return LHS == RHS; 3237 return LHS->isIdenticalTo(RHS); 3238 } 3239 }; 3240 3241 } // end anonymous namespace 3242 3243 ///Perform cse of induction variable instructions. 3244 static void cse(BasicBlock *BB) { 3245 // Perform simple cse. 3246 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3247 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3248 Instruction *In = &*I++; 3249 3250 if (!CSEDenseMapInfo::canHandle(In)) 3251 continue; 3252 3253 // Check if we can replace this instruction with any of the 3254 // visited instructions. 3255 if (Instruction *V = CSEMap.lookup(In)) { 3256 In->replaceAllUsesWith(V); 3257 In->eraseFromParent(); 3258 continue; 3259 } 3260 3261 CSEMap[In] = In; 3262 } 3263 } 3264 3265 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3266 unsigned VF, 3267 bool &NeedToScalarize) { 3268 Function *F = CI->getCalledFunction(); 3269 Type *ScalarRetTy = CI->getType(); 3270 SmallVector<Type *, 4> Tys, ScalarTys; 3271 for (auto &ArgOp : CI->arg_operands()) 3272 ScalarTys.push_back(ArgOp->getType()); 3273 3274 // Estimate cost of scalarized vector call. 
The source operands are assumed 3275 // to be vectors, so we need to extract individual elements from there, 3276 // execute VF scalar calls, and then gather the result into the vector return 3277 // value. 3278 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); 3279 if (VF == 1) 3280 return ScalarCallCost; 3281 3282 // Compute corresponding vector type for return value and arguments. 3283 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3284 for (Type *ScalarTy : ScalarTys) 3285 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3286 3287 // Compute costs of unpacking argument values for the scalar calls and 3288 // packing the return values to a vector. 3289 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3290 3291 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3292 3293 // If we can't emit a vector call for this function, then the currently found 3294 // cost is the cost we need to return. 3295 NeedToScalarize = true; 3296 VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/); 3297 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3298 3299 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3300 return Cost; 3301 3302 // If the corresponding vector cost is cheaper, return its cost. 3303 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); 3304 if (VectorCallCost < Cost) { 3305 NeedToScalarize = false; 3306 return VectorCallCost; 3307 } 3308 return Cost; 3309 } 3310 3311 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3312 unsigned VF) { 3313 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3314 assert(ID && "Expected intrinsic call!"); 3315 3316 FastMathFlags FMF; 3317 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3318 FMF = FPMO->getFastMathFlags(); 3319 3320 SmallVector<Value *, 4> Operands(CI->arg_operands()); 3321 return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF, CI); 3322 } 3323 3324 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3325 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3326 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3327 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3328 } 3329 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3330 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3331 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3332 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3333 } 3334 3335 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3336 // For every instruction `I` in MinBWs, truncate the operands, create a 3337 // truncated version of `I` and reextend its result. InstCombine runs 3338 // later and will remove any ext/trunc pairs. 3339 SmallPtrSet<Value *, 4> Erased; 3340 for (const auto &KV : Cost->getMinimalBitwidths()) { 3341 // If the value wasn't vectorized, we must maintain the original scalar 3342 // type. The absence of the value from VectorLoopValueMap indicates that it 3343 // wasn't vectorized. 
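// As an illustration of the overall rewrite (shorthand, assuming VF = 4 and a
// value known to need only 8 bits), an add computed as <4 x i32> becomes:
//   %a.tr = trunc <4 x i32> %a to <4 x i8>
//   %b.tr = trunc <4 x i32> %b to <4 x i8>
//   %s.tr = add <4 x i8> %a.tr, %b.tr
//   %s    = zext <4 x i8> %s.tr to <4 x i32>
// with the trunc/zext pairs expected to be cleaned up by InstCombine.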
3344 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3345 continue; 3346 for (unsigned Part = 0; Part < UF; ++Part) { 3347 Value *I = getOrCreateVectorValue(KV.first, Part); 3348 if (Erased.find(I) != Erased.end() || I->use_empty() || 3349 !isa<Instruction>(I)) 3350 continue; 3351 Type *OriginalTy = I->getType(); 3352 Type *ScalarTruncatedTy = 3353 IntegerType::get(OriginalTy->getContext(), KV.second); 3354 Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, 3355 OriginalTy->getVectorNumElements()); 3356 if (TruncatedTy == OriginalTy) 3357 continue; 3358 3359 IRBuilder<> B(cast<Instruction>(I)); 3360 auto ShrinkOperand = [&](Value *V) -> Value * { 3361 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3362 if (ZI->getSrcTy() == TruncatedTy) 3363 return ZI->getOperand(0); 3364 return B.CreateZExtOrTrunc(V, TruncatedTy); 3365 }; 3366 3367 // The actual instruction modification depends on the instruction type, 3368 // unfortunately. 3369 Value *NewI = nullptr; 3370 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3371 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3372 ShrinkOperand(BO->getOperand(1))); 3373 3374 // Any wrapping introduced by shrinking this operation shouldn't be 3375 // considered undefined behavior. So, we can't unconditionally copy 3376 // arithmetic wrapping flags to NewI. 3377 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3378 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3379 NewI = 3380 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3381 ShrinkOperand(CI->getOperand(1))); 3382 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3383 NewI = B.CreateSelect(SI->getCondition(), 3384 ShrinkOperand(SI->getTrueValue()), 3385 ShrinkOperand(SI->getFalseValue())); 3386 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3387 switch (CI->getOpcode()) { 3388 default: 3389 llvm_unreachable("Unhandled cast!"); 3390 case Instruction::Trunc: 3391 NewI = ShrinkOperand(CI->getOperand(0)); 3392 break; 3393 case Instruction::SExt: 3394 NewI = B.CreateSExtOrTrunc( 3395 CI->getOperand(0), 3396 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3397 break; 3398 case Instruction::ZExt: 3399 NewI = B.CreateZExtOrTrunc( 3400 CI->getOperand(0), 3401 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3402 break; 3403 } 3404 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3405 auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); 3406 auto *O0 = B.CreateZExtOrTrunc( 3407 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3408 auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); 3409 auto *O1 = B.CreateZExtOrTrunc( 3410 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3411 3412 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3413 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3414 // Don't do anything with the operands, just extend the result. 
3415 continue; 3416 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3417 auto Elements = IE->getOperand(0)->getType()->getVectorNumElements(); 3418 auto *O0 = B.CreateZExtOrTrunc( 3419 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3420 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3421 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3422 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3423 auto Elements = EE->getOperand(0)->getType()->getVectorNumElements(); 3424 auto *O0 = B.CreateZExtOrTrunc( 3425 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3426 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3427 } else { 3428 // If we don't know what to do, be conservative and don't do anything. 3429 continue; 3430 } 3431 3432 // Lastly, extend the result. 3433 NewI->takeName(cast<Instruction>(I)); 3434 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3435 I->replaceAllUsesWith(Res); 3436 cast<Instruction>(I)->eraseFromParent(); 3437 Erased.insert(I); 3438 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3439 } 3440 } 3441 3442 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3443 for (const auto &KV : Cost->getMinimalBitwidths()) { 3444 // If the value wasn't vectorized, we must maintain the original scalar 3445 // type. The absence of the value from VectorLoopValueMap indicates that it 3446 // wasn't vectorized. 3447 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3448 continue; 3449 for (unsigned Part = 0; Part < UF; ++Part) { 3450 Value *I = getOrCreateVectorValue(KV.first, Part); 3451 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3452 if (Inst && Inst->use_empty()) { 3453 Value *NewI = Inst->getOperand(0); 3454 Inst->eraseFromParent(); 3455 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3456 } 3457 } 3458 } 3459 } 3460 3461 void InnerLoopVectorizer::fixVectorizedLoop() { 3462 // Insert truncates and extends for any truncated instructions as hints to 3463 // InstCombine. 3464 if (VF > 1) 3465 truncateToMinimalBitwidths(); 3466 3467 // Fix widened non-induction PHIs by setting up the PHI operands. 3468 if (OrigPHIsToFix.size()) { 3469 assert(EnableVPlanNativePath && 3470 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3471 fixNonInductionPHIs(); 3472 } 3473 3474 // At this point every instruction in the original loop is widened to a 3475 // vector form. Now we need to fix the recurrences in the loop. These PHI 3476 // nodes are currently empty because we did not want to introduce cycles. 3477 // This is the second stage of vectorizing recurrences. 3478 fixCrossIterationPHIs(); 3479 3480 // Forget the original basic block. 3481 PSE.getSE()->forgetLoop(OrigLoop); 3482 3483 // Fix-up external users of the induction variables. 3484 for (auto &Entry : Legal->getInductionVars()) 3485 fixupIVUsers(Entry.first, Entry.second, 3486 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3487 IVEndValues[Entry.first], LoopMiddleBlock); 3488 3489 fixLCSSAPHIs(); 3490 for (Instruction *PI : PredicatedInstructions) 3491 sinkScalarOperands(&*PI); 3492 3493 // Remove redundant induction instructions. 3494 cse(LoopVectorBody); 3495 3496 // Set/update profile weights for the vector and remainder loops as original 3497 // loop iterations are now distributed among them. Note that original loop 3498 // represented by LoopScalarBody becomes remainder loop after vectorization. 
3499 //
3500 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3501 // end up with a slightly rougher result, but that should be OK since
3502 // profile is not inherently precise anyway. Note also that a possible bypass
3503 // of the vector code caused by the legality checks is ignored, optimistically
3504 // assigning all the weight to the vector loop.
3505 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3506 LI->getLoopFor(LoopVectorBody),
3507 LI->getLoopFor(LoopScalarBody), VF * UF);
3508 }
3509
3510 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3511 // In order to support recurrences we need to be able to vectorize Phi nodes.
3512 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3513 // stage #2: We now need to fix the recurrences by adding incoming edges to
3514 // the currently empty PHI nodes. At this point every instruction in the
3515 // original loop is widened to a vector form so we can use them to construct
3516 // the incoming edges.
3517 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3518 // Handle first-order recurrences and reductions that need to be fixed.
3519 if (Legal->isFirstOrderRecurrence(&Phi))
3520 fixFirstOrderRecurrence(&Phi);
3521 else if (Legal->isReductionVariable(&Phi))
3522 fixReduction(&Phi);
3523 }
3524 }
3525
3526 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3527 // This is the second phase of vectorizing first-order recurrences. An
3528 // overview of the transformation is described below. Suppose we have the
3529 // following loop.
3530 //
3531 // for (int i = 0; i < n; ++i)
3532 // b[i] = a[i] - a[i - 1];
3533 //
3534 // There is a first-order recurrence on "a". For this loop, the shorthand
3535 // scalar IR looks like:
3536 //
3537 // scalar.ph:
3538 // s_init = a[-1]
3539 // br scalar.body
3540 //
3541 // scalar.body:
3542 // i = phi [0, scalar.ph], [i+1, scalar.body]
3543 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3544 // s2 = a[i]
3545 // b[i] = s2 - s1
3546 // br cond, scalar.body, ...
3547 //
3548 // In this example, s1 is a recurrence because its value depends on the
3549 // previous iteration. In the first phase of vectorization, we created a
3550 // temporary value for s1. We now complete the vectorization and produce the
3551 // shorthand vector IR shown below (for VF = 4, UF = 1).
3552 //
3553 // vector.ph:
3554 // v_init = vector(..., ..., ..., a[-1])
3555 // br vector.body
3556 //
3557 // vector.body:
3558 // i = phi [0, vector.ph], [i+4, vector.body]
3559 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3560 // v2 = a[i, i+1, i+2, i+3];
3561 // v3 = vector(v1(3), v2(0, 1, 2))
3562 // b[i, i+1, i+2, i+3] = v2 - v3
3563 // br cond, vector.body, middle.block
3564 //
3565 // middle.block:
3566 // x = v2(3)
3567 // br scalar.ph
3568 //
3569 // scalar.ph:
3570 // s_init = phi [x, middle.block], [a[-1], otherwise]
3571 // br scalar.body
3572 //
3573 // After the vector loop finishes executing, we extract the next value of
3574 // the recurrence (x) to use as the initial value in the scalar loop.
3575
3576 // Get the original loop preheader and single loop latch.
3577 auto *Preheader = OrigLoop->getLoopPreheader();
3578 auto *Latch = OrigLoop->getLoopLatch();
3579
3580 // Get the initial and previous values of the scalar recurrence.
3581 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3582 auto *Previous = Phi->getIncomingValueForBlock(Latch);
3583
3584 // Create a vector from the initial value.
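// In shorthand (illustrative, for VF = 4 and an i32 recurrence) this emits:
//   %vector.recur.init = insertelement <4 x i32> undef, i32 %s_init, i32 3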
3585 auto *VectorInit = ScalarInit; 3586 if (VF > 1) { 3587 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3588 VectorInit = Builder.CreateInsertElement( 3589 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3590 Builder.getInt32(VF - 1), "vector.recur.init"); 3591 } 3592 3593 // We constructed a temporary phi node in the first phase of vectorization. 3594 // This phi node will eventually be deleted. 3595 Builder.SetInsertPoint( 3596 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3597 3598 // Create a phi node for the new recurrence. The current value will either be 3599 // the initial value inserted into a vector or loop-varying vector value. 3600 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3601 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3602 3603 // Get the vectorized previous value of the last part UF - 1. It appears last 3604 // among all unrolled iterations, due to the order of their construction. 3605 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3606 3607 // Find and set the insertion point after the previous value if it is an 3608 // instruction. 3609 BasicBlock::iterator InsertPt; 3610 // Note that the previous value may have been constant-folded so it is not 3611 // guaranteed to be an instruction in the vector loop. 3612 // FIXME: Loop invariant values do not form recurrences. We should deal with 3613 // them earlier. 3614 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3615 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3616 else { 3617 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3618 if (isa<PHINode>(PreviousLastPart)) 3619 // If the previous value is a phi node, we should insert after all the phi 3620 // nodes in the block containing the PHI to avoid breaking basic block 3621 // verification. Note that the basic block may be different to 3622 // LoopVectorBody, in case we predicate the loop. 3623 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3624 else 3625 InsertPt = ++PreviousInst->getIterator(); 3626 } 3627 Builder.SetInsertPoint(&*InsertPt); 3628 3629 // We will construct a vector for the recurrence by combining the values for 3630 // the current and previous iterations. This is the required shuffle mask. 3631 SmallVector<Constant *, 8> ShuffleMask(VF); 3632 ShuffleMask[0] = Builder.getInt32(VF - 1); 3633 for (unsigned I = 1; I < VF; ++I) 3634 ShuffleMask[I] = Builder.getInt32(I + VF - 1); 3635 3636 // The vector from which to take the initial value for the current iteration 3637 // (actual or unrolled). Initially, this is the vector phi node. 3638 Value *Incoming = VecPhi; 3639 3640 // Shuffle the current and previous vector and update the vector parts. 3641 for (unsigned Part = 0; Part < UF; ++Part) { 3642 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3643 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3644 auto *Shuffle = 3645 VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, 3646 ConstantVector::get(ShuffleMask)) 3647 : Incoming; 3648 PhiPart->replaceAllUsesWith(Shuffle); 3649 cast<Instruction>(PhiPart)->eraseFromParent(); 3650 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3651 Incoming = PreviousPart; 3652 } 3653 3654 // Fix the latch value of the new recurrence in the vector loop. 3655 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3656 3657 // Extract the last vector element in the middle block. 
This will be the 3658 // initial value for the recurrence when jumping to the scalar loop. 3659 auto *ExtractForScalar = Incoming; 3660 if (VF > 1) { 3661 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3662 ExtractForScalar = Builder.CreateExtractElement( 3663 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); 3664 } 3665 // Extract the second last element in the middle block if the 3666 // Phi is used outside the loop. We need to extract the phi itself 3667 // and not the last element (the phi update in the current iteration). This 3668 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3669 // when the scalar loop is not run at all. 3670 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3671 if (VF > 1) 3672 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3673 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); 3674 // When loop is unrolled without vectorizing, initialize 3675 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 3676 // `Incoming`. This is analogous to the vectorized case above: extracting the 3677 // second last element when VF > 1. 3678 else if (UF > 1) 3679 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 3680 3681 // Fix the initial value of the original recurrence in the scalar loop. 3682 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3683 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3684 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3685 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3686 Start->addIncoming(Incoming, BB); 3687 } 3688 3689 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3690 Phi->setName("scalar.recur"); 3691 3692 // Finally, fix users of the recurrence outside the loop. The users will need 3693 // either the last value of the scalar recurrence or the last value of the 3694 // vector recurrence we extracted in the middle block. Since the loop is in 3695 // LCSSA form, we just need to find all the phi nodes for the original scalar 3696 // recurrence in the exit block, and then add an edge for the middle block. 3697 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3698 if (LCSSAPhi.getIncomingValue(0) == Phi) { 3699 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 3700 } 3701 } 3702 } 3703 3704 void InnerLoopVectorizer::fixReduction(PHINode *Phi) { 3705 Constant *Zero = Builder.getInt32(0); 3706 3707 // Get it's reduction variable descriptor. 3708 assert(Legal->isReductionVariable(Phi) && 3709 "Unable to find the reduction variable"); 3710 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 3711 3712 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3713 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3714 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3715 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = 3716 RdxDesc.getMinMaxRecurrenceKind(); 3717 setDebugLocFromInst(Builder, ReductionStartValue); 3718 3719 // We need to generate a reduction vector from the incoming scalar. 3720 // To do so, we need to generate the 'identity' vector and override 3721 // one of the elements with the incoming scalar reduction. We need 3722 // to do it in the vector-loop preheader. 3723 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3724 3725 // This is the vector-clone of the value that leaves the loop. 
3726 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); 3727 3728 // Find the reduction identity variable. Zero for addition, or, xor, 3729 // one for multiplication, -1 for And. 3730 Value *Identity; 3731 Value *VectorStart; 3732 if (RK == RecurrenceDescriptor::RK_IntegerMinMax || 3733 RK == RecurrenceDescriptor::RK_FloatMinMax) { 3734 // MinMax reduction have the start value as their identify. 3735 if (VF == 1) { 3736 VectorStart = Identity = ReductionStartValue; 3737 } else { 3738 VectorStart = Identity = 3739 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); 3740 } 3741 } else { 3742 // Handle other reduction kinds: 3743 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 3744 RK, VecTy->getScalarType()); 3745 if (VF == 1) { 3746 Identity = Iden; 3747 // This vector is the Identity vector where the first element is the 3748 // incoming scalar reduction. 3749 VectorStart = ReductionStartValue; 3750 } else { 3751 Identity = ConstantVector::getSplat({VF, false}, Iden); 3752 3753 // This vector is the Identity vector where the first element is the 3754 // incoming scalar reduction. 3755 VectorStart = 3756 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); 3757 } 3758 } 3759 3760 // Wrap flags are in general invalid after vectorization, clear them. 3761 clearReductionWrapFlags(RdxDesc); 3762 3763 // Fix the vector-loop phi. 3764 3765 // Reductions do not have to start at zero. They can start with 3766 // any loop invariant values. 3767 BasicBlock *Latch = OrigLoop->getLoopLatch(); 3768 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 3769 3770 for (unsigned Part = 0; Part < UF; ++Part) { 3771 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 3772 Value *Val = getOrCreateVectorValue(LoopVal, Part); 3773 // Make sure to add the reduction start value only to the 3774 // first unroll part. 3775 Value *StartVal = (Part == 0) ? VectorStart : Identity; 3776 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 3777 cast<PHINode>(VecRdxPhi) 3778 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3779 } 3780 3781 // Before each round, move the insertion point right between 3782 // the PHIs and the values we are going to write. 3783 // This allows us to write both PHINodes and the extractelement 3784 // instructions. 3785 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3786 3787 setDebugLocFromInst(Builder, LoopExitInst); 3788 3789 // If tail is folded by masking, the vector value to leave the loop should be 3790 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3791 // instead of the former. 3792 if (Cost->foldTailByMasking()) { 3793 for (unsigned Part = 0; Part < UF; ++Part) { 3794 Value *VecLoopExitInst = 3795 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3796 Value *Sel = nullptr; 3797 for (User *U : VecLoopExitInst->users()) { 3798 if (isa<SelectInst>(U)) { 3799 assert(!Sel && "Reduction exit feeding two selects"); 3800 Sel = U; 3801 } else 3802 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3803 } 3804 assert(Sel && "Reduction exit feeds no select"); 3805 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 3806 } 3807 } 3808 3809 // If the vector reduction can be performed in a smaller type, we truncate 3810 // then extend the loop exit value to enable InstCombine to evaluate the 3811 // entire expression in the smaller type. 
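// For example (shorthand, assuming an i32 add reduction known to fit in i8
// and VF = 4), each unrolled part is rewritten as
//   %t = trunc <4 x i32> %rdx.part to <4 x i8>
//   %e = zext <4 x i8> %t to <4 x i32>   ; sext if the reduction is signed
// so that the final reduction in the middle block can be done on <4 x i8>.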
3812 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { 3813 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3814 Builder.SetInsertPoint( 3815 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 3816 VectorParts RdxParts(UF); 3817 for (unsigned Part = 0; Part < UF; ++Part) { 3818 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3819 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3820 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3821 : Builder.CreateZExt(Trunc, VecTy); 3822 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 3823 UI != RdxParts[Part]->user_end();) 3824 if (*UI != Trunc) { 3825 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 3826 RdxParts[Part] = Extnd; 3827 } else { 3828 ++UI; 3829 } 3830 } 3831 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3832 for (unsigned Part = 0; Part < UF; ++Part) { 3833 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3834 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 3835 } 3836 } 3837 3838 // Reduce all of the unrolled parts into a single vector. 3839 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 3840 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3841 3842 // The middle block terminator has already been assigned a DebugLoc here (the 3843 // OrigLoop's single latch terminator). We want the whole middle block to 3844 // appear to execute on this line because: (a) it is all compiler generated, 3845 // (b) these instructions are always executed after evaluating the latch 3846 // conditional branch, and (c) other passes may add new predecessors which 3847 // terminate on this line. This is the easiest way to ensure we don't 3848 // accidentally cause an extra step back into the loop while debugging. 3849 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 3850 for (unsigned Part = 1; Part < UF; ++Part) { 3851 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3852 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3853 // Floating point operations had to be 'fast' to enable the reduction. 3854 ReducedPartRdx = addFastMathFlag( 3855 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 3856 ReducedPartRdx, "bin.rdx"), 3857 RdxDesc.getFastMathFlags()); 3858 else 3859 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 3860 RdxPart); 3861 } 3862 3863 if (VF > 1) { 3864 bool NoNaN = Legal->hasFunNoNaNAttr(); 3865 ReducedPartRdx = 3866 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 3867 // If the reduction can be performed in a smaller type, we need to extend 3868 // the reduction to the wider type before we branch to the original loop. 3869 if (Phi->getType() != RdxDesc.getRecurrenceType()) 3870 ReducedPartRdx = 3871 RdxDesc.isSigned() 3872 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 3873 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 3874 } 3875 3876 // Create a phi node that merges control-flow from the backedge-taken check 3877 // block and the middle block. 
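// In shorthand (illustrative):
//   %bc.merge.rdx = phi [ %rdx.start, <each bypass block> ],
//                       [ %reduced.rdx, %middle.block ]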
3878 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 3879 LoopScalarPreHeader->getTerminator()); 3880 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3881 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3882 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3883 3884 // Now, we need to fix the users of the reduction variable 3885 // inside and outside of the scalar remainder loop. 3886 // We know that the loop is in LCSSA form. We need to update the 3887 // PHI nodes in the exit blocks. 3888 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3889 // All PHINodes need to have a single entry edge, or two if 3890 // we already fixed them. 3891 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 3892 3893 // We found a reduction value exit-PHI. Update it with the 3894 // incoming bypass edge. 3895 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 3896 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 3897 } // end of the LCSSA phi scan. 3898 3899 // Fix the scalar loop reduction variable with the incoming reduction sum 3900 // from the vector body and from the backedge value. 3901 int IncomingEdgeBlockIdx = 3902 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 3903 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 3904 // Pick the other block. 3905 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 3906 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3907 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3908 } 3909 3910 void InnerLoopVectorizer::clearReductionWrapFlags( 3911 RecurrenceDescriptor &RdxDesc) { 3912 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3913 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 3914 RK != RecurrenceDescriptor::RK_IntegerMult) 3915 return; 3916 3917 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 3918 assert(LoopExitInstr && "null loop exit instruction"); 3919 SmallVector<Instruction *, 8> Worklist; 3920 SmallPtrSet<Instruction *, 8> Visited; 3921 Worklist.push_back(LoopExitInstr); 3922 Visited.insert(LoopExitInstr); 3923 3924 while (!Worklist.empty()) { 3925 Instruction *Cur = Worklist.pop_back_val(); 3926 if (isa<OverflowingBinaryOperator>(Cur)) 3927 for (unsigned Part = 0; Part < UF; ++Part) { 3928 Value *V = getOrCreateVectorValue(Cur, Part); 3929 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 3930 } 3931 3932 for (User *U : Cur->users()) { 3933 Instruction *UI = cast<Instruction>(U); 3934 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 3935 Visited.insert(UI).second) 3936 Worklist.push_back(UI); 3937 } 3938 } 3939 } 3940 3941 void InnerLoopVectorizer::fixLCSSAPHIs() { 3942 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3943 if (LCSSAPhi.getNumIncomingValues() == 1) { 3944 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 3945 // Non-instruction incoming values will have only one value. 3946 unsigned LastLane = 0; 3947 if (isa<Instruction>(IncomingValue)) 3948 LastLane = Cost->isUniformAfterVectorization( 3949 cast<Instruction>(IncomingValue), VF) 3950 ? 0 3951 : VF - 1; 3952 // Can be a loop invariant incoming value or the last scalar value to be 3953 // extracted from the vectorized loop. 
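// For example (illustrative), with UF = 2 and VF = 4 a non-uniform incoming
// instruction uses lane 3 of unroll part 1, i.e. the value produced by the
// last scalar iteration covered by the vector loop.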
3954 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3955 Value *lastIncomingValue =
3956 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3957 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3958 }
3959 }
3960 }
3961
3962 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3963 // The basic block and loop containing the predicated instruction.
3964 auto *PredBB = PredInst->getParent();
3965 auto *VectorLoop = LI->getLoopFor(PredBB);
3966
3967 // Initialize a worklist with the operands of the predicated instruction.
3968 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3969
3970 // Holds instructions that we need to analyze again. An instruction may be
3971 // reanalyzed if we don't yet know whether we can sink it.
3972 SmallVector<Instruction *, 8> InstsToReanalyze;
3973
3974 // Returns true if a given use occurs in the predicated block. Phi nodes use
3975 // their operands in their corresponding predecessor blocks.
3976 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3977 auto *I = cast<Instruction>(U.getUser());
3978 BasicBlock *BB = I->getParent();
3979 if (auto *Phi = dyn_cast<PHINode>(I))
3980 BB = Phi->getIncomingBlock(
3981 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3982 return BB == PredBB;
3983 };
3984
3985 // Iteratively sink the scalarized operands of the predicated instruction
3986 // into the block we created for it. When an instruction is sunk, its
3987 // operands are then added to the worklist. The algorithm ends when a pass
3988 // through the worklist doesn't sink a single instruction.
3989 bool Changed;
3990 do {
3991 // Add the instructions that need to be reanalyzed to the worklist, and
3992 // reset the changed indicator.
3993 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3994 InstsToReanalyze.clear();
3995 Changed = false;
3996
3997 while (!Worklist.empty()) {
3998 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3999
4000 // We can't sink an instruction if it is a phi node, is already in the
4001 // predicated block, is not in the loop, or may have side effects.
4002 if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4003 !VectorLoop->contains(I) || I->mayHaveSideEffects())
4004 continue;
4005
4006 // It's legal to sink the instruction if all its uses occur in the
4007 // predicated block. Otherwise, there's nothing to do yet, and we may
4008 // need to reanalyze the instruction.
4009 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4010 InstsToReanalyze.push_back(I);
4011 continue;
4012 }
4013
4014 // Move the instruction to the beginning of the predicated block, and add
4015 // its operands to the worklist.
4016 I->moveBefore(&*PredBB->getFirstInsertionPt());
4017 Worklist.insert(I->op_begin(), I->op_end());
4018
4019 // The sinking may have enabled other instructions to be sunk, so we will
4020 // need to iterate.
4021 Changed = true; 4022 } 4023 } while (Changed); 4024 } 4025 4026 void InnerLoopVectorizer::fixNonInductionPHIs() { 4027 for (PHINode *OrigPhi : OrigPHIsToFix) { 4028 PHINode *NewPhi = 4029 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4030 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4031 4032 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4033 predecessors(OrigPhi->getParent())); 4034 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4035 predecessors(NewPhi->getParent())); 4036 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4037 "Scalar and Vector BB should have the same number of predecessors"); 4038 4039 // The insertion point in Builder may be invalidated by the time we get 4040 // here. Force the Builder insertion point to something valid so that we do 4041 // not run into issues during insertion point restore in 4042 // getOrCreateVectorValue calls below. 4043 Builder.SetInsertPoint(NewPhi); 4044 4045 // The predecessor order is preserved and we can rely on mapping between 4046 // scalar and vector block predecessors. 4047 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4048 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4049 4050 // When looking up the new scalar/vector values to fix up, use incoming 4051 // values from original phi. 4052 Value *ScIncV = 4053 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4054 4055 // Scalar incoming value may need a broadcast 4056 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4057 NewPhi->addIncoming(NewIncV, NewPredBB); 4058 } 4059 } 4060 } 4061 4062 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, 4063 unsigned VF, bool IsPtrLoopInvariant, 4064 SmallBitVector &IsIndexLoopInvariant) { 4065 // Construct a vector GEP by widening the operands of the scalar GEP as 4066 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4067 // results in a vector of pointers when at least one operand of the GEP 4068 // is vector-typed. Thus, to keep the representation compact, we only use 4069 // vector-typed operands for loop-varying values. 4070 4071 if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4072 // If we are vectorizing, but the GEP has only loop-invariant operands, 4073 // the GEP we build (by only using vector-typed operands for 4074 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4075 // produce a vector of pointers, we need to either arbitrarily pick an 4076 // operand to broadcast, or broadcast a clone of the original GEP. 4077 // Here, we broadcast a clone of the original. 4078 // 4079 // TODO: If at some point we decide to scalarize instructions having 4080 // loop-invariant operands, this special case will no longer be 4081 // required. We would add the scalarization decision to 4082 // collectLoopScalars() and teach getVectorValue() to broadcast 4083 // the lane-zero scalar value. 4084 auto *Clone = Builder.Insert(GEP->clone()); 4085 for (unsigned Part = 0; Part < UF; ++Part) { 4086 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4087 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4088 addMetadata(EntryPart, GEP); 4089 } 4090 } else { 4091 // If the GEP has at least one loop-varying operand, we are sure to 4092 // produce a vector of pointers. But if we are only unrolling, we want 4093 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4094 // produce with the code below will be scalar (if VF == 1) or vector 4095 // (otherwise). 
Note that for the unroll-only case, we still maintain 4096 // values in the vector mapping with initVector, as we do for other 4097 // instructions. 4098 for (unsigned Part = 0; Part < UF; ++Part) { 4099 // The pointer operand of the new GEP. If it's loop-invariant, we 4100 // won't broadcast it. 4101 auto *Ptr = IsPtrLoopInvariant 4102 ? GEP->getPointerOperand() 4103 : getOrCreateVectorValue(GEP->getPointerOperand(), Part); 4104 4105 // Collect all the indices for the new GEP. If any index is 4106 // loop-invariant, we won't broadcast it. 4107 SmallVector<Value *, 4> Indices; 4108 for (auto Index : enumerate(GEP->indices())) { 4109 Value *User = Index.value().get(); 4110 if (IsIndexLoopInvariant[Index.index()]) 4111 Indices.push_back(User); 4112 else 4113 Indices.push_back(getOrCreateVectorValue(User, Part)); 4114 } 4115 4116 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4117 // but it should be a vector, otherwise. 4118 auto *NewGEP = 4119 GEP->isInBounds() 4120 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4121 Indices) 4122 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4123 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4124 "NewGEP is not a pointer vector"); 4125 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4126 addMetadata(NewGEP, GEP); 4127 } 4128 } 4129 } 4130 4131 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4132 unsigned VF) { 4133 PHINode *P = cast<PHINode>(PN); 4134 if (EnableVPlanNativePath) { 4135 // Currently we enter here in the VPlan-native path for non-induction 4136 // PHIs where all control flow is uniform. We simply widen these PHIs. 4137 // Create a vector phi with no operands - the vector phi operands will be 4138 // set at the end of vector code generation. 4139 Type *VecTy = 4140 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4141 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4142 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4143 OrigPHIsToFix.push_back(P); 4144 4145 return; 4146 } 4147 4148 assert(PN->getParent() == OrigLoop->getHeader() && 4149 "Non-header phis should have been handled elsewhere"); 4150 4151 // In order to support recurrences we need to be able to vectorize Phi nodes. 4152 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4153 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4154 // this value when we vectorize all of the instructions that use the PHI. 4155 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4156 for (unsigned Part = 0; Part < UF; ++Part) { 4157 // This is phase one of vectorizing PHIs. 4158 Type *VecTy = 4159 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4160 Value *EntryPart = PHINode::Create( 4161 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4162 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4163 } 4164 return; 4165 } 4166 4167 setDebugLocFromInst(Builder, P); 4168 4169 // This PHINode must be an induction variable. 4170 // Make sure that we know about it. 4171 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4172 4173 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4174 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4175 4176 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4177 // which can be found from the original scalar operations. 
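  // Of the induction kinds handled by the switch below, only pointer
  // inductions are actually lowered here; integer and floating-point
  // inductions are widened elsewhere, and their cases merely assert that.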
4178 switch (II.getKind()) { 4179 case InductionDescriptor::IK_NoInduction: 4180 llvm_unreachable("Unknown induction"); 4181 case InductionDescriptor::IK_IntInduction: 4182 case InductionDescriptor::IK_FpInduction: 4183 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4184 case InductionDescriptor::IK_PtrInduction: { 4185 // Handle the pointer induction variable case. 4186 assert(P->getType()->isPointerTy() && "Unexpected type."); 4187 // This is the normalized GEP that starts counting at zero. 4188 Value *PtrInd = Induction; 4189 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); 4190 // Determine the number of scalars we need to generate for each unroll 4191 // iteration. If the instruction is uniform, we only need to generate the 4192 // first lane. Otherwise, we generate all VF values. 4193 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; 4194 // These are the scalar results. Notice that we don't generate vector GEPs 4195 // because scalar GEPs result in better code. 4196 for (unsigned Part = 0; Part < UF; ++Part) { 4197 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4198 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); 4199 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4200 Value *SclrGep = 4201 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4202 SclrGep->setName("next.gep"); 4203 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4204 } 4205 } 4206 return; 4207 } 4208 } 4209 } 4210 4211 /// A helper function for checking whether an integer division-related 4212 /// instruction may divide by zero (in which case it must be predicated if 4213 /// executed conditionally in the scalar code). 4214 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4215 /// Non-zero divisors that are non compile-time constants will not be 4216 /// converted into multiplication, so we will still end up scalarizing 4217 /// the division, but can do so w/o predication. 4218 static bool mayDivideByZero(Instruction &I) { 4219 assert((I.getOpcode() == Instruction::UDiv || 4220 I.getOpcode() == Instruction::SDiv || 4221 I.getOpcode() == Instruction::URem || 4222 I.getOpcode() == Instruction::SRem) && 4223 "Unexpected instruction"); 4224 Value *Divisor = I.getOperand(1); 4225 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4226 return !CInt || CInt->isZero(); 4227 } 4228 4229 void InnerLoopVectorizer::widenInstruction(Instruction &I) { 4230 switch (I.getOpcode()) { 4231 case Instruction::Call: 4232 case Instruction::Br: 4233 case Instruction::PHI: 4234 case Instruction::GetElementPtr: 4235 llvm_unreachable("This instruction is handled by a different recipe."); 4236 case Instruction::UDiv: 4237 case Instruction::SDiv: 4238 case Instruction::SRem: 4239 case Instruction::URem: 4240 case Instruction::Add: 4241 case Instruction::FAdd: 4242 case Instruction::Sub: 4243 case Instruction::FSub: 4244 case Instruction::FNeg: 4245 case Instruction::Mul: 4246 case Instruction::FMul: 4247 case Instruction::FDiv: 4248 case Instruction::FRem: 4249 case Instruction::Shl: 4250 case Instruction::LShr: 4251 case Instruction::AShr: 4252 case Instruction::And: 4253 case Instruction::Or: 4254 case Instruction::Xor: { 4255 // Just widen unops and binops. 
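    // (Illustrative: for %r = add nsw i32 %x, %y with VF = 4 and UF = 2, the
    // loop below emits two wide adds, one per unroll part, each operating on
    // <4 x i32> operands obtained from getOrCreateVectorValue; IR flags such
    // as nsw are copied over from the scalar instruction.)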
4256 setDebugLocFromInst(Builder, &I); 4257 4258 for (unsigned Part = 0; Part < UF; ++Part) { 4259 SmallVector<Value *, 2> Ops; 4260 for (Value *Op : I.operands()) 4261 Ops.push_back(getOrCreateVectorValue(Op, Part)); 4262 4263 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4264 4265 if (auto *VecOp = dyn_cast<Instruction>(V)) 4266 VecOp->copyIRFlags(&I); 4267 4268 // Use this vector value for all users of the original instruction. 4269 VectorLoopValueMap.setVectorValue(&I, Part, V); 4270 addMetadata(V, &I); 4271 } 4272 4273 break; 4274 } 4275 case Instruction::Select: { 4276 // Widen selects. 4277 // If the selector is loop invariant we can create a select 4278 // instruction with a scalar condition. Otherwise, use vector-select. 4279 auto *SE = PSE.getSE(); 4280 bool InvariantCond = 4281 SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); 4282 setDebugLocFromInst(Builder, &I); 4283 4284 // The condition can be loop invariant but still defined inside the 4285 // loop. This means that we can't just use the original 'cond' value. 4286 // We have to take the 'vectorized' value and pick the first lane. 4287 // Instcombine will make this a no-op. 4288 4289 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); 4290 4291 for (unsigned Part = 0; Part < UF; ++Part) { 4292 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); 4293 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); 4294 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); 4295 Value *Sel = 4296 Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1); 4297 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4298 addMetadata(Sel, &I); 4299 } 4300 4301 break; 4302 } 4303 4304 case Instruction::ICmp: 4305 case Instruction::FCmp: { 4306 // Widen compares. Generate vector compares. 4307 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4308 auto *Cmp = cast<CmpInst>(&I); 4309 setDebugLocFromInst(Builder, Cmp); 4310 for (unsigned Part = 0; Part < UF; ++Part) { 4311 Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); 4312 Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part); 4313 Value *C = nullptr; 4314 if (FCmp) { 4315 // Propagate fast math flags. 4316 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4317 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4318 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4319 } else { 4320 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4321 } 4322 VectorLoopValueMap.setVectorValue(&I, Part, C); 4323 addMetadata(C, &I); 4324 } 4325 4326 break; 4327 } 4328 4329 case Instruction::ZExt: 4330 case Instruction::SExt: 4331 case Instruction::FPToUI: 4332 case Instruction::FPToSI: 4333 case Instruction::FPExt: 4334 case Instruction::PtrToInt: 4335 case Instruction::IntToPtr: 4336 case Instruction::SIToFP: 4337 case Instruction::UIToFP: 4338 case Instruction::Trunc: 4339 case Instruction::FPTrunc: 4340 case Instruction::BitCast: { 4341 auto *CI = cast<CastInst>(&I); 4342 setDebugLocFromInst(Builder, CI); 4343 4344 /// Vectorize casts. 4345 Type *DestTy = 4346 (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); 4347 4348 for (unsigned Part = 0; Part < UF; ++Part) { 4349 Value *A = getOrCreateVectorValue(CI->getOperand(0), Part); 4350 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4351 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4352 addMetadata(Cast, &I); 4353 } 4354 break; 4355 } 4356 default: 4357 // This instruction is not vectorized by simple widening. 
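    // (Memory operations, calls, branches, GEPs and PHIs are widened by
    // dedicated code paths or recipes, so reaching this point indicates a
    // bug.)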
4358 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4359 llvm_unreachable("Unhandled instruction!"); 4360 } // end of switch. 4361 } 4362 4363 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, 4364 VPTransformState &State) { 4365 assert(!isa<DbgInfoIntrinsic>(I) && 4366 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4367 setDebugLocFromInst(Builder, &I); 4368 4369 Module *M = I.getParent()->getParent()->getParent(); 4370 auto *CI = cast<CallInst>(&I); 4371 4372 SmallVector<Type *, 4> Tys; 4373 for (Value *ArgOperand : CI->arg_operands()) 4374 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4375 4376 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4377 4378 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4379 // version of the instruction. 4380 // Is it beneficial to perform intrinsic call compared to lib call? 4381 bool NeedToScalarize = false; 4382 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4383 bool UseVectorIntrinsic = 4384 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4385 assert((UseVectorIntrinsic || !NeedToScalarize) && 4386 "Instruction should be scalarized elsewhere."); 4387 4388 for (unsigned Part = 0; Part < UF; ++Part) { 4389 SmallVector<Value *, 4> Args; 4390 for (auto &I : enumerate(ArgOperands.operands())) { 4391 // Some intrinsics have a scalar argument - don't replace it with a 4392 // vector. 4393 Value *Arg; 4394 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4395 Arg = State.get(I.value(), Part); 4396 else 4397 Arg = State.get(I.value(), {0, 0}); 4398 Args.push_back(Arg); 4399 } 4400 4401 Function *VectorF; 4402 if (UseVectorIntrinsic) { 4403 // Use vector version of the intrinsic. 4404 Type *TysForDecl[] = {CI->getType()}; 4405 if (VF > 1) 4406 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4407 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4408 } else { 4409 // Use vector version of the function call. 4410 const VFShape Shape = 4411 VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/); 4412 #ifndef NDEBUG 4413 const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI); 4414 assert(std::find_if(Infos.begin(), Infos.end(), 4415 [&Shape](const VFInfo &Info) { 4416 return Info.Shape == Shape; 4417 }) != Infos.end() && 4418 "Vector function shape is missing from the database."); 4419 #endif 4420 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4421 } 4422 assert(VectorF && "Can't create vector function."); 4423 4424 SmallVector<OperandBundleDef, 1> OpBundles; 4425 CI->getOperandBundlesAsDefs(OpBundles); 4426 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4427 4428 if (isa<FPMathOperator>(V)) 4429 V->copyFastMathFlags(CI); 4430 4431 VectorLoopValueMap.setVectorValue(&I, Part, V); 4432 addMetadata(V, &I); 4433 } 4434 } 4435 4436 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4437 // We should not collect Scalars more than once per VF. Right now, this 4438 // function is called from collectUniformsAndScalars(), which already does 4439 // this check. Collecting Scalars for VF=1 does not make any sense. 4440 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && 4441 "This function should not be visited twice for the same VF"); 4442 4443 SmallSetVector<Instruction *, 8> Worklist; 4444 4445 // These sets are used to seed the analysis with pointers used by memory 4446 // accesses that will remain scalar. 
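  // (Illustrative: the address GEP of a store that the cost model decided to
  // scalarize remains scalar and seeds ScalarPtrs, provided its only users
  // are loads and stores; the address of an access that becomes a gather or
  // scatter does not.)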
4447 SmallSetVector<Instruction *, 8> ScalarPtrs; 4448 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4449 4450 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4451 // The pointer operands of loads and stores will be scalar as long as the 4452 // memory access is not a gather or scatter operation. The value operand of a 4453 // store will remain scalar if the store is scalarized. 4454 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4455 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4456 assert(WideningDecision != CM_Unknown && 4457 "Widening decision should be ready at this moment"); 4458 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4459 if (Ptr == Store->getValueOperand()) 4460 return WideningDecision == CM_Scalarize; 4461 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4462 "Ptr is neither a value or pointer operand"); 4463 return WideningDecision != CM_GatherScatter; 4464 }; 4465 4466 // A helper that returns true if the given value is a bitcast or 4467 // getelementptr instruction contained in the loop. 4468 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4469 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4470 isa<GetElementPtrInst>(V)) && 4471 !TheLoop->isLoopInvariant(V); 4472 }; 4473 4474 // A helper that evaluates a memory access's use of a pointer. If the use 4475 // will be a scalar use, and the pointer is only used by memory accesses, we 4476 // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4477 // PossibleNonScalarPtrs. 4478 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4479 // We only care about bitcast and getelementptr instructions contained in 4480 // the loop. 4481 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4482 return; 4483 4484 // If the pointer has already been identified as scalar (e.g., if it was 4485 // also identified as uniform), there's nothing to do. 4486 auto *I = cast<Instruction>(Ptr); 4487 if (Worklist.count(I)) 4488 return; 4489 4490 // If the use of the pointer will be a scalar use, and all users of the 4491 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4492 // place the pointer in PossibleNonScalarPtrs. 4493 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4494 return isa<LoadInst>(U) || isa<StoreInst>(U); 4495 })) 4496 ScalarPtrs.insert(I); 4497 else 4498 PossibleNonScalarPtrs.insert(I); 4499 }; 4500 4501 // We seed the scalars analysis with three classes of instructions: (1) 4502 // instructions marked uniform-after-vectorization, (2) bitcast and 4503 // getelementptr instructions used by memory accesses requiring a scalar use, 4504 // and (3) pointer induction variables and their update instructions (we 4505 // currently only scalarize these). 4506 // 4507 // (1) Add to the worklist all instructions that have been identified as 4508 // uniform-after-vectorization. 4509 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4510 4511 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4512 // memory accesses requiring a scalar use. The pointer operands of loads and 4513 // stores will be scalar as long as the memory accesses is not a gather or 4514 // scatter operation. The value operand of a store will remain scalar if the 4515 // store is scalarized. 
4516 for (auto *BB : TheLoop->blocks()) 4517 for (auto &I : *BB) { 4518 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4519 evaluatePtrUse(Load, Load->getPointerOperand()); 4520 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4521 evaluatePtrUse(Store, Store->getPointerOperand()); 4522 evaluatePtrUse(Store, Store->getValueOperand()); 4523 } 4524 } 4525 for (auto *I : ScalarPtrs) 4526 if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) { 4527 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4528 Worklist.insert(I); 4529 } 4530 4531 // (3) Add to the worklist all pointer induction variables and their update 4532 // instructions. 4533 // 4534 // TODO: Once we are able to vectorize pointer induction variables we should 4535 // no longer insert them into the worklist here. 4536 auto *Latch = TheLoop->getLoopLatch(); 4537 for (auto &Induction : Legal->getInductionVars()) { 4538 auto *Ind = Induction.first; 4539 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4540 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) 4541 continue; 4542 Worklist.insert(Ind); 4543 Worklist.insert(IndUpdate); 4544 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4545 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4546 << "\n"); 4547 } 4548 4549 // Insert the forced scalars. 4550 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4551 // induction variable when the PHI user is scalarized. 4552 auto ForcedScalar = ForcedScalars.find(VF); 4553 if (ForcedScalar != ForcedScalars.end()) 4554 for (auto *I : ForcedScalar->second) 4555 Worklist.insert(I); 4556 4557 // Expand the worklist by looking through any bitcasts and getelementptr 4558 // instructions we've already identified as scalar. This is similar to the 4559 // expansion step in collectLoopUniforms(); however, here we're only 4560 // expanding to include additional bitcasts and getelementptr instructions. 4561 unsigned Idx = 0; 4562 while (Idx != Worklist.size()) { 4563 Instruction *Dst = Worklist[Idx++]; 4564 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4565 continue; 4566 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4567 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4568 auto *J = cast<Instruction>(U); 4569 return !TheLoop->contains(J) || Worklist.count(J) || 4570 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4571 isScalarUse(J, Src)); 4572 })) { 4573 Worklist.insert(Src); 4574 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4575 } 4576 } 4577 4578 // An induction variable will remain scalar if all users of the induction 4579 // variable and induction variable update remain scalar. 4580 for (auto &Induction : Legal->getInductionVars()) { 4581 auto *Ind = Induction.first; 4582 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4583 4584 // We already considered pointer induction variables, so there's no reason 4585 // to look at their users again. 4586 // 4587 // TODO: Once we are able to vectorize pointer induction variables we 4588 // should no longer skip over them here. 4589 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) 4590 continue; 4591 4592 // Determine if all users of the induction variable are scalar after 4593 // vectorization. 
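    // (Illustrative: an induction variable whose only in-loop users are
    // instructions already in the scalar worklist stays scalar; one that also
    // feeds a widened arithmetic instruction is left out of the scalar set.)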
4594 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4595 auto *I = cast<Instruction>(U); 4596 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4597 }); 4598 if (!ScalarInd) 4599 continue; 4600 4601 // Determine if all users of the induction variable update instruction are 4602 // scalar after vectorization. 4603 auto ScalarIndUpdate = 4604 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4605 auto *I = cast<Instruction>(U); 4606 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4607 }); 4608 if (!ScalarIndUpdate) 4609 continue; 4610 4611 // The induction variable and its update instruction will remain scalar. 4612 Worklist.insert(Ind); 4613 Worklist.insert(IndUpdate); 4614 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4615 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4616 << "\n"); 4617 } 4618 4619 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4620 } 4621 4622 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { 4623 if (!blockNeedsPredication(I->getParent())) 4624 return false; 4625 switch(I->getOpcode()) { 4626 default: 4627 break; 4628 case Instruction::Load: 4629 case Instruction::Store: { 4630 if (!Legal->isMaskRequired(I)) 4631 return false; 4632 auto *Ptr = getLoadStorePointerOperand(I); 4633 auto *Ty = getMemInstValueType(I); 4634 // We have already decided how to vectorize this instruction, get that 4635 // result. 4636 if (VF > 1) { 4637 InstWidening WideningDecision = getWideningDecision(I, VF); 4638 assert(WideningDecision != CM_Unknown && 4639 "Widening decision should be ready at this moment"); 4640 return WideningDecision == CM_Scalarize; 4641 } 4642 const MaybeAlign Alignment = getLoadStoreAlignment(I); 4643 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4644 isLegalMaskedGather(Ty, Alignment)) 4645 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4646 isLegalMaskedScatter(Ty, Alignment)); 4647 } 4648 case Instruction::UDiv: 4649 case Instruction::SDiv: 4650 case Instruction::SRem: 4651 case Instruction::URem: 4652 return mayDivideByZero(*I); 4653 } 4654 return false; 4655 } 4656 4657 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, 4658 unsigned VF) { 4659 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4660 assert(getWideningDecision(I, VF) == CM_Unknown && 4661 "Decision should not be set yet."); 4662 auto *Group = getInterleavedAccessGroup(I); 4663 assert(Group && "Must have a group."); 4664 4665 // If the instruction's allocated size doesn't equal it's type size, it 4666 // requires padding and will be scalarized. 4667 auto &DL = I->getModule()->getDataLayout(); 4668 auto *ScalarTy = getMemInstValueType(I); 4669 if (hasIrregularType(ScalarTy, DL, VF)) 4670 return false; 4671 4672 // Check if masking is required. 4673 // A Group may need masking for one of two reasons: it resides in a block that 4674 // needs predication, or it was decided to use masking to deal with gaps. 
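  // (Illustrative: a group that accesses only members 0 and 1 of a
  // three-member interleave factor has a gap; it may require a scalar
  // epilogue so the widened access does not run past the last element the
  // original loop touches, and if no scalar epilogue is allowed the group
  // must be masked instead.)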
4675 bool PredicatedAccessRequiresMasking = 4676 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 4677 bool AccessWithGapsRequiresMasking = 4678 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 4679 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 4680 return true; 4681 4682 // If masked interleaving is required, we expect that the user/target had 4683 // enabled it, because otherwise it either wouldn't have been created or 4684 // it should have been invalidated by the CostModel. 4685 assert(useMaskedInterleavedAccesses(TTI) && 4686 "Masked interleave-groups for predicated accesses are not enabled."); 4687 4688 auto *Ty = getMemInstValueType(I); 4689 const MaybeAlign Alignment = getLoadStoreAlignment(I); 4690 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4691 : TTI.isLegalMaskedStore(Ty, Alignment); 4692 } 4693 4694 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, 4695 unsigned VF) { 4696 // Get and ensure we have a valid memory instruction. 4697 LoadInst *LI = dyn_cast<LoadInst>(I); 4698 StoreInst *SI = dyn_cast<StoreInst>(I); 4699 assert((LI || SI) && "Invalid memory instruction"); 4700 4701 auto *Ptr = getLoadStorePointerOperand(I); 4702 4703 // In order to be widened, the pointer should be consecutive, first of all. 4704 if (!Legal->isConsecutivePtr(Ptr)) 4705 return false; 4706 4707 // If the instruction is a store located in a predicated block, it will be 4708 // scalarized. 4709 if (isScalarWithPredication(I)) 4710 return false; 4711 4712 // If the instruction's allocated size doesn't equal it's type size, it 4713 // requires padding and will be scalarized. 4714 auto &DL = I->getModule()->getDataLayout(); 4715 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 4716 if (hasIrregularType(ScalarTy, DL, VF)) 4717 return false; 4718 4719 return true; 4720 } 4721 4722 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { 4723 // We should not collect Uniforms more than once per VF. Right now, 4724 // this function is called from collectUniformsAndScalars(), which 4725 // already does this check. Collecting Uniforms for VF=1 does not make any 4726 // sense. 4727 4728 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && 4729 "This function should not be visited twice for the same VF"); 4730 4731 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4732 // not analyze again. Uniforms.count(VF) will return 1. 4733 Uniforms[VF].clear(); 4734 4735 // We now know that the loop is vectorizable! 4736 // Collect instructions inside the loop that will remain uniform after 4737 // vectorization. 4738 4739 // Global values, params and instructions outside of current loop are out of 4740 // scope. 4741 auto isOutOfScope = [&](Value *V) -> bool { 4742 Instruction *I = dyn_cast<Instruction>(V); 4743 return (!I || !TheLoop->contains(I)); 4744 }; 4745 4746 SetVector<Instruction *> Worklist; 4747 BasicBlock *Latch = TheLoop->getLoopLatch(); 4748 4749 // Instructions that are scalar with predication must not be considered 4750 // uniform after vectorization, because that would create an erroneous 4751 // replicating region where only a single instance out of VF should be formed. 4752 // TODO: optimize such seldom cases if found important, see PR40816. 
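  // (Illustrative: a conditional sdiv that must be predicated because its
  // divisor may be zero is executed separately per lane, so treating it as
  // uniform would be wrong; the helper below therefore refuses to add such
  // instructions.)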
4753 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4754 if (isScalarWithPredication(I, VF)) { 4755 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4756 << *I << "\n"); 4757 return; 4758 } 4759 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4760 Worklist.insert(I); 4761 }; 4762 4763 // Start with the conditional branch. If the branch condition is an 4764 // instruction contained in the loop that is only used by the branch, it is 4765 // uniform. 4766 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4767 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4768 addToWorklistIfAllowed(Cmp); 4769 4770 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 4771 // are pointers that are treated like consecutive pointers during 4772 // vectorization. The pointer operands of interleaved accesses are an 4773 // example. 4774 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 4775 4776 // Holds pointer operands of instructions that are possibly non-uniform. 4777 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 4778 4779 auto isUniformDecision = [&](Instruction *I, unsigned VF) { 4780 InstWidening WideningDecision = getWideningDecision(I, VF); 4781 assert(WideningDecision != CM_Unknown && 4782 "Widening decision should be ready at this moment"); 4783 4784 return (WideningDecision == CM_Widen || 4785 WideningDecision == CM_Widen_Reverse || 4786 WideningDecision == CM_Interleave); 4787 }; 4788 // Iterate over the instructions in the loop, and collect all 4789 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 4790 // that a consecutive-like pointer operand will be scalarized, we collect it 4791 // in PossibleNonUniformPtrs instead. We use two sets here because a single 4792 // getelementptr instruction can be used by both vectorized and scalarized 4793 // memory instructions. For example, if a loop loads and stores from the same 4794 // location, but the store is conditional, the store will be scalarized, and 4795 // the getelementptr won't remain uniform. 4796 for (auto *BB : TheLoop->blocks()) 4797 for (auto &I : *BB) { 4798 // If there's no pointer operand, there's nothing to do. 4799 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 4800 if (!Ptr) 4801 continue; 4802 4803 // True if all users of Ptr are memory accesses that have Ptr as their 4804 // pointer operand. 4805 auto UsersAreMemAccesses = 4806 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 4807 return getLoadStorePointerOperand(U) == Ptr; 4808 }); 4809 4810 // Ensure the memory instruction will not be scalarized or used by 4811 // gather/scatter, making its pointer operand non-uniform. If the pointer 4812 // operand is used by any instruction other than a memory access, we 4813 // conservatively assume the pointer operand may be non-uniform. 4814 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 4815 PossibleNonUniformPtrs.insert(Ptr); 4816 4817 // If the memory instruction will be vectorized and its pointer operand 4818 // is consecutive-like, or interleaving - the pointer operand should 4819 // remain uniform. 4820 else 4821 ConsecutiveLikePtrs.insert(Ptr); 4822 } 4823 4824 // Add to the Worklist all consecutive and consecutive-like pointers that 4825 // aren't also identified as possibly non-uniform. 
4826 for (auto *V : ConsecutiveLikePtrs) 4827 if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) 4828 addToWorklistIfAllowed(V); 4829 4830 // Expand Worklist in topological order: whenever a new instruction 4831 // is added , its users should be already inside Worklist. It ensures 4832 // a uniform instruction will only be used by uniform instructions. 4833 unsigned idx = 0; 4834 while (idx != Worklist.size()) { 4835 Instruction *I = Worklist[idx++]; 4836 4837 for (auto OV : I->operand_values()) { 4838 // isOutOfScope operands cannot be uniform instructions. 4839 if (isOutOfScope(OV)) 4840 continue; 4841 // First order recurrence Phi's should typically be considered 4842 // non-uniform. 4843 auto *OP = dyn_cast<PHINode>(OV); 4844 if (OP && Legal->isFirstOrderRecurrence(OP)) 4845 continue; 4846 // If all the users of the operand are uniform, then add the 4847 // operand into the uniform worklist. 4848 auto *OI = cast<Instruction>(OV); 4849 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4850 auto *J = cast<Instruction>(U); 4851 return Worklist.count(J) || 4852 (OI == getLoadStorePointerOperand(J) && 4853 isUniformDecision(J, VF)); 4854 })) 4855 addToWorklistIfAllowed(OI); 4856 } 4857 } 4858 4859 // Returns true if Ptr is the pointer operand of a memory access instruction 4860 // I, and I is known to not require scalarization. 4861 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4862 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4863 }; 4864 4865 // For an instruction to be added into Worklist above, all its users inside 4866 // the loop should also be in Worklist. However, this condition cannot be 4867 // true for phi nodes that form a cyclic dependence. We must process phi 4868 // nodes separately. An induction variable will remain uniform if all users 4869 // of the induction variable and induction variable update remain uniform. 4870 // The code below handles both pointer and non-pointer induction variables. 4871 for (auto &Induction : Legal->getInductionVars()) { 4872 auto *Ind = Induction.first; 4873 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4874 4875 // Determine if all users of the induction variable are uniform after 4876 // vectorization. 4877 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4878 auto *I = cast<Instruction>(U); 4879 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4880 isVectorizedMemAccessUse(I, Ind); 4881 }); 4882 if (!UniformInd) 4883 continue; 4884 4885 // Determine if all users of the induction variable update instruction are 4886 // uniform after vectorization. 4887 auto UniformIndUpdate = 4888 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4889 auto *I = cast<Instruction>(U); 4890 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4891 isVectorizedMemAccessUse(I, IndUpdate); 4892 }); 4893 if (!UniformIndUpdate) 4894 continue; 4895 4896 // The induction variable and its update instruction will remain uniform. 4897 addToWorklistIfAllowed(Ind); 4898 addToWorklistIfAllowed(IndUpdate); 4899 } 4900 4901 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4902 } 4903 4904 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4905 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4906 4907 if (Legal->getRuntimePointerChecking()->Need) { 4908 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4909 "runtime pointer checks needed. 
Enable vectorization of this " 4910 "loop with '#pragma clang loop vectorize(enable)' when " 4911 "compiling with -Os/-Oz", 4912 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4913 return true; 4914 } 4915 4916 if (!PSE.getUnionPredicate().getPredicates().empty()) { 4917 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4918 "runtime SCEV checks needed. Enable vectorization of this " 4919 "loop with '#pragma clang loop vectorize(enable)' when " 4920 "compiling with -Os/-Oz", 4921 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4922 return true; 4923 } 4924 4925 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4926 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4927 reportVectorizationFailure("Runtime stride check is required with -Os/-Oz", 4928 "runtime stride == 1 checks needed. Enable vectorization of " 4929 "this loop with '#pragma clang loop vectorize(enable)' when " 4930 "compiling with -Os/-Oz", 4931 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4932 return true; 4933 } 4934 4935 return false; 4936 } 4937 4938 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { 4939 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4940 // TODO: It may by useful to do since it's still likely to be dynamically 4941 // uniform if the target can skip. 4942 reportVectorizationFailure( 4943 "Not inserting runtime ptr check for divergent target", 4944 "runtime pointer checks needed. Not enabled for divergent target", 4945 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 4946 return None; 4947 } 4948 4949 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4950 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4951 if (TC == 1) { 4952 reportVectorizationFailure("Single iteration (non) loop", 4953 "loop trip count is one, irrelevant for vectorization", 4954 "SingleIterationLoop", ORE, TheLoop); 4955 return None; 4956 } 4957 4958 switch (ScalarEpilogueStatus) { 4959 case CM_ScalarEpilogueAllowed: 4960 return computeFeasibleMaxVF(TC); 4961 case CM_ScalarEpilogueNotNeededUsePredicate: 4962 LLVM_DEBUG( 4963 dbgs() << "LV: vector predicate hint/switch found.\n" 4964 << "LV: Not allowing scalar epilogue, creating predicated " 4965 << "vector loop.\n"); 4966 break; 4967 case CM_ScalarEpilogueNotAllowedLowTripLoop: 4968 // fallthrough as a special case of OptForSize 4969 case CM_ScalarEpilogueNotAllowedOptSize: 4970 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 4971 LLVM_DEBUG( 4972 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 4973 else 4974 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4975 << "count.\n"); 4976 4977 // Bail if runtime checks are required, which are not good when optimising 4978 // for size. 4979 if (runtimeChecksRequired()) 4980 return None; 4981 break; 4982 } 4983 4984 // Now try the tail folding 4985 4986 // Invalidate interleave groups that require an epilogue if we can't mask 4987 // the interleave-group. 4988 if (!useMaskedInterleavedAccesses(TTI)) 4989 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 4990 4991 unsigned MaxVF = computeFeasibleMaxVF(TC); 4992 if (TC > 0 && TC % MaxVF == 0) { 4993 // Accept MaxVF if we do not have a tail. 
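    // (Illustrative: TC = 128 with MaxVF = 8 gives 128 % 8 == 0, so every
    // iteration runs in the vector body and neither tail folding nor a
    // scalar epilogue is required.)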
4994 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 4995 return MaxVF; 4996 } 4997 4998 // If we don't know the precise trip count, or if the trip count that we 4999 // found modulo the vectorization factor is not zero, try to fold the tail 5000 // by masking. 5001 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5002 if (Legal->prepareToFoldTailByMasking()) { 5003 FoldTailByMasking = true; 5004 return MaxVF; 5005 } 5006 5007 if (TC == 0) { 5008 reportVectorizationFailure( 5009 "Unable to calculate the loop count due to complex control flow", 5010 "unable to calculate the loop count due to complex control flow", 5011 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5012 return None; 5013 } 5014 5015 reportVectorizationFailure( 5016 "Cannot optimize for size and vectorize at the same time.", 5017 "cannot optimize for size and vectorize at the same time. " 5018 "Enable vectorization of this loop with '#pragma clang loop " 5019 "vectorize(enable)' when compiling with -Os/-Oz", 5020 "NoTailLoopWithOptForSize", ORE, TheLoop); 5021 return None; 5022 } 5023 5024 unsigned 5025 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 5026 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5027 unsigned SmallestType, WidestType; 5028 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5029 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5030 5031 // Get the maximum safe dependence distance in bits computed by LAA. 5032 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5033 // the memory accesses that is most restrictive (involved in the smallest 5034 // dependence distance). 5035 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 5036 5037 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 5038 5039 unsigned MaxVectorSize = WidestRegister / WidestType; 5040 5041 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5042 << " / " << WidestType << " bits.\n"); 5043 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5044 << WidestRegister << " bits.\n"); 5045 5046 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 5047 " into one vector!"); 5048 if (MaxVectorSize == 0) { 5049 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5050 MaxVectorSize = 1; 5051 return MaxVectorSize; 5052 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5053 isPowerOf2_32(ConstTripCount)) { 5054 // We need to clamp the VF to be the ConstTripCount. There is no point in 5055 // choosing a higher viable VF as done in the loop below. 5056 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5057 << ConstTripCount << "\n"); 5058 MaxVectorSize = ConstTripCount; 5059 return MaxVectorSize; 5060 } 5061 5062 unsigned MaxVF = MaxVectorSize; 5063 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5064 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5065 // Collect all viable vectorization factors larger than the default MaxVF 5066 // (i.e. MaxVectorSize). 5067 SmallVector<unsigned, 8> VFs; 5068 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5069 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5070 VFs.push_back(VS); 5071 5072 // For each VF calculate its register usage. 5073 auto RUs = calculateRegisterUsage(VFs); 5074 5075 // Select the largest VF which doesn't require more registers than existing 5076 // ones. 
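    // (Illustrative: if VF = 16 would need 40 registers of some class but the
    // target only provides 32, it is rejected and a smaller candidate, say
    // VF = 8 needing 20, is chosen by the loop below.)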
5077 for (int i = RUs.size() - 1; i >= 0; --i) { 5078 bool Selected = true; 5079 for (auto& pair : RUs[i].MaxLocalUsers) { 5080 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5081 if (pair.second > TargetNumRegisters) 5082 Selected = false; 5083 } 5084 if (Selected) { 5085 MaxVF = VFs[i]; 5086 break; 5087 } 5088 } 5089 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5090 if (MaxVF < MinVF) { 5091 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5092 << ") with target's minimum: " << MinVF << '\n'); 5093 MaxVF = MinVF; 5094 } 5095 } 5096 } 5097 return MaxVF; 5098 } 5099 5100 VectorizationFactor 5101 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5102 float Cost = expectedCost(1).first; 5103 const float ScalarCost = Cost; 5104 unsigned Width = 1; 5105 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5106 5107 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5108 if (ForceVectorization && MaxVF > 1) { 5109 // Ignore scalar width, because the user explicitly wants vectorization. 5110 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5111 // evaluation. 5112 Cost = std::numeric_limits<float>::max(); 5113 } 5114 5115 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5116 // Notice that the vector loop needs to be executed less times, so 5117 // we need to divide the cost of the vector loops by the width of 5118 // the vector elements. 5119 VectorizationCostTy C = expectedCost(i); 5120 float VectorCost = C.first / (float)i; 5121 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5122 << " costs: " << (int)VectorCost << ".\n"); 5123 if (!C.second && !ForceVectorization) { 5124 LLVM_DEBUG( 5125 dbgs() << "LV: Not considering vector loop of width " << i 5126 << " because it will not generate any vector instructions.\n"); 5127 continue; 5128 } 5129 if (VectorCost < Cost) { 5130 Cost = VectorCost; 5131 Width = i; 5132 } 5133 } 5134 5135 if (!EnableCondStoresVectorization && NumPredStores) { 5136 reportVectorizationFailure("There are conditional stores.", 5137 "store that is conditionally executed prevents vectorization", 5138 "ConditionalStore", ORE, TheLoop); 5139 Width = 1; 5140 Cost = ScalarCost; 5141 } 5142 5143 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5144 << "LV: Vectorization seems to be not beneficial, " 5145 << "but was forced by a user.\n"); 5146 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5147 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; 5148 return Factor; 5149 } 5150 5151 std::pair<unsigned, unsigned> 5152 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5153 unsigned MinWidth = -1U; 5154 unsigned MaxWidth = 8; 5155 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5156 5157 // For each block. 5158 for (BasicBlock *BB : TheLoop->blocks()) { 5159 // For each instruction in the loop. 5160 for (Instruction &I : BB->instructionsWithoutDebug()) { 5161 Type *T = I.getType(); 5162 5163 // Skip ignored values. 5164 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end()) 5165 continue; 5166 5167 // Only examine Loads, Stores and PHINodes. 5168 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5169 continue; 5170 5171 // Examine PHI nodes that are reduction variables. Update the type to 5172 // account for the recurrence type. 
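      // (Illustrative: an i32 reduction phi whose recurrence was proven to
      // need only 8 bits contributes 8 bits, not 32, to the width analysis
      // below.)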
5173 if (auto *PN = dyn_cast<PHINode>(&I)) { 5174 if (!Legal->isReductionVariable(PN)) 5175 continue; 5176 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5177 T = RdxDesc.getRecurrenceType(); 5178 } 5179 5180 // Examine the stored values. 5181 if (auto *ST = dyn_cast<StoreInst>(&I)) 5182 T = ST->getValueOperand()->getType(); 5183 5184 // Ignore loaded pointer types and stored pointer types that are not 5185 // vectorizable. 5186 // 5187 // FIXME: The check here attempts to predict whether a load or store will 5188 // be vectorized. We only know this for certain after a VF has 5189 // been selected. Here, we assume that if an access can be 5190 // vectorized, it will be. We should also look at extending this 5191 // optimization to non-pointer types. 5192 // 5193 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5194 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5195 continue; 5196 5197 MinWidth = std::min(MinWidth, 5198 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5199 MaxWidth = std::max(MaxWidth, 5200 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5201 } 5202 } 5203 5204 return {MinWidth, MaxWidth}; 5205 } 5206 5207 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, 5208 unsigned LoopCost) { 5209 // -- The interleave heuristics -- 5210 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5211 // There are many micro-architectural considerations that we can't predict 5212 // at this level. For example, frontend pressure (on decode or fetch) due to 5213 // code size, or the number and capabilities of the execution ports. 5214 // 5215 // We use the following heuristics to select the interleave count: 5216 // 1. If the code has reductions, then we interleave to break the cross 5217 // iteration dependency. 5218 // 2. If the loop is really small, then we interleave to reduce the loop 5219 // overhead. 5220 // 3. We don't interleave if we think that we will spill registers to memory 5221 // due to the increased register pressure. 5222 5223 if (!isScalarEpilogueAllowed()) 5224 return 1; 5225 5226 // We used the distance for the interleave count. 5227 if (Legal->getMaxSafeDepDistBytes() != -1U) 5228 return 1; 5229 5230 // Do not interleave loops with a relatively small known or estimated trip 5231 // count. 5232 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5233 if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) 5234 return 1; 5235 5236 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5237 // We divide by these constants so assume that we have at least one 5238 // instruction that uses at least one register. 5239 for (auto& pair : R.MaxLocalUsers) { 5240 pair.second = std::max(pair.second, 1U); 5241 } 5242 5243 // We calculate the interleave count using the following formula. 5244 // Subtract the number of loop invariants from the number of available 5245 // registers. These registers are used by all of the interleaved instances. 5246 // Next, divide the remaining registers by the number of registers that is 5247 // required by the loop, in order to estimate how many parallel instances 5248 // fit without causing spills. All of this is rounded down if necessary to be 5249 // a power of two. We want power of two interleave count to simplify any 5250 // addressing operations or alignment considerations. 
5251 // We also want power of two interleave counts to ensure that the induction 5252 // variable of the vector loop wraps to zero, when tail is folded by masking; 5253 // this currently happens when OptForSize, in which case IC is set to 1 above. 5254 unsigned IC = UINT_MAX; 5255 5256 for (auto& pair : R.MaxLocalUsers) { 5257 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5258 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5259 << " registers of " 5260 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5261 if (VF == 1) { 5262 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5263 TargetNumRegisters = ForceTargetNumScalarRegs; 5264 } else { 5265 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5266 TargetNumRegisters = ForceTargetNumVectorRegs; 5267 } 5268 unsigned MaxLocalUsers = pair.second; 5269 unsigned LoopInvariantRegs = 0; 5270 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5271 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5272 5273 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5274 // Don't count the induction variable as interleaved. 5275 if (EnableIndVarRegisterHeur) { 5276 TmpIC = 5277 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5278 std::max(1U, (MaxLocalUsers - 1))); 5279 } 5280 5281 IC = std::min(IC, TmpIC); 5282 } 5283 5284 // Clamp the interleave ranges to reasonable counts. 5285 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5286 5287 // Check if the user has overridden the max. 5288 if (VF == 1) { 5289 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5290 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5291 } else { 5292 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5293 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5294 } 5295 5296 // If trip count is known or estimated compile time constant, limit the 5297 // interleave count to be less than the trip count divided by VF. 5298 if (BestKnownTC) { 5299 MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); 5300 } 5301 5302 // If we did not calculate the cost for VF (because the user selected the VF) 5303 // then we calculate the cost of VF here. 5304 if (LoopCost == 0) 5305 LoopCost = expectedCost(VF).first; 5306 5307 assert(LoopCost && "Non-zero loop cost expected"); 5308 5309 // Clamp the calculated IC to be between the 1 and the max interleave count 5310 // that the target and trip count allows. 5311 if (IC > MaxInterleaveCount) 5312 IC = MaxInterleaveCount; 5313 else if (IC < 1) 5314 IC = 1; 5315 5316 // Interleave if we vectorized this loop and there is a reduction that could 5317 // benefit from interleaving. 5318 if (VF > 1 && !Legal->getReductionVars().empty()) { 5319 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5320 return IC; 5321 } 5322 5323 // Note that if we've already vectorized the loop we will have done the 5324 // runtime check and so interleaving won't require further checks. 5325 bool InterleavingRequiresRuntimePointerCheck = 5326 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5327 5328 // We want to interleave small loops in order to reduce the loop overhead and 5329 // potentially expose ILP opportunities. 
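  // (Illustrative, with a hypothetical SmallLoopCost of 20: a loop whose body
  // costs 5 gets SmallIC = min(IC, PowerOf2Floor(20 / 5)) = 4, so the assumed
  // per-iteration overhead of 1 is roughly 5% of the interleaved body cost.)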
5330 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5331 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5332 // We assume that the cost overhead is 1 and we use the cost model 5333 // to estimate the cost of the loop and interleave until the cost of the 5334 // loop overhead is about 5% of the cost of the loop. 5335 unsigned SmallIC = 5336 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5337 5338 // Interleave until store/load ports (estimated by max interleave count) are 5339 // saturated. 5340 unsigned NumStores = Legal->getNumStores(); 5341 unsigned NumLoads = Legal->getNumLoads(); 5342 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5343 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5344 5345 // If we have a scalar reduction (vector reductions are already dealt with 5346 // by this point), we can increase the critical path length if the loop 5347 // we're interleaving is inside another loop. Limit, by default to 2, so the 5348 // critical path only gets increased by one reduction operation. 5349 if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) { 5350 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5351 SmallIC = std::min(SmallIC, F); 5352 StoresIC = std::min(StoresIC, F); 5353 LoadsIC = std::min(LoadsIC, F); 5354 } 5355 5356 if (EnableLoadStoreRuntimeInterleave && 5357 std::max(StoresIC, LoadsIC) > SmallIC) { 5358 LLVM_DEBUG( 5359 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5360 return std::max(StoresIC, LoadsIC); 5361 } 5362 5363 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5364 return SmallIC; 5365 } 5366 5367 // Interleave if this is a large loop (small loops are already dealt with by 5368 // this point) that could benefit from interleaving. 5369 bool HasReductions = !Legal->getReductionVars().empty(); 5370 if (TTI.enableAggressiveInterleaving(HasReductions)) { 5371 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5372 return IC; 5373 } 5374 5375 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5376 return 1; 5377 } 5378 5379 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5380 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { 5381 // This function calculates the register usage by measuring the highest number 5382 // of values that are alive at a single location. Obviously, this is a very 5383 // rough estimation. We scan the loop in a topological order in order and 5384 // assign a number to each instruction. We use RPO to ensure that defs are 5385 // met before their users. We assume that each instruction that has in-loop 5386 // users starts an interval. We record every time that an in-loop value is 5387 // used, so we have a list of the first and last occurrences of each 5388 // instruction. Next, we transpose this data structure into a multi map that 5389 // holds the list of intervals that *end* at a specific location. This multi 5390 // map allows us to perform a linear search. We scan the instructions linearly 5391 // and record each time that a new interval starts, by placing it in a set. 5392 // If we find this value in the multi-map then we remove it from the set. 5393 // The max register usage is the maximum size of the set. 5394 // We also search for instructions that are defined outside the loop, but are 5395 // used inside the loop. 
We need this number separately from the max-interval 5396 // usage number because when we unroll, loop-invariant values do not take 5397 // more register. 5398 LoopBlocksDFS DFS(TheLoop); 5399 DFS.perform(LI); 5400 5401 RegisterUsage RU; 5402 5403 // Each 'key' in the map opens a new interval. The values 5404 // of the map are the index of the 'last seen' usage of the 5405 // instruction that is the key. 5406 using IntervalMap = DenseMap<Instruction *, unsigned>; 5407 5408 // Maps instruction to its index. 5409 SmallVector<Instruction *, 64> IdxToInstr; 5410 // Marks the end of each interval. 5411 IntervalMap EndPoint; 5412 // Saves the list of instruction indices that are used in the loop. 5413 SmallPtrSet<Instruction *, 8> Ends; 5414 // Saves the list of values that are used in the loop but are 5415 // defined outside the loop, such as arguments and constants. 5416 SmallPtrSet<Value *, 8> LoopInvariants; 5417 5418 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5419 for (Instruction &I : BB->instructionsWithoutDebug()) { 5420 IdxToInstr.push_back(&I); 5421 5422 // Save the end location of each USE. 5423 for (Value *U : I.operands()) { 5424 auto *Instr = dyn_cast<Instruction>(U); 5425 5426 // Ignore non-instruction values such as arguments, constants, etc. 5427 if (!Instr) 5428 continue; 5429 5430 // If this instruction is outside the loop then record it and continue. 5431 if (!TheLoop->contains(Instr)) { 5432 LoopInvariants.insert(Instr); 5433 continue; 5434 } 5435 5436 // Overwrite previous end points. 5437 EndPoint[Instr] = IdxToInstr.size(); 5438 Ends.insert(Instr); 5439 } 5440 } 5441 } 5442 5443 // Saves the list of intervals that end with the index in 'key'. 5444 using InstrList = SmallVector<Instruction *, 2>; 5445 DenseMap<unsigned, InstrList> TransposeEnds; 5446 5447 // Transpose the EndPoints to a list of values that end at each index. 5448 for (auto &Interval : EndPoint) 5449 TransposeEnds[Interval.second].push_back(Interval.first); 5450 5451 SmallPtrSet<Instruction *, 8> OpenIntervals; 5452 5453 // Get the size of the widest register. 5454 unsigned MaxSafeDepDist = -1U; 5455 if (Legal->getMaxSafeDepDistBytes() != -1U) 5456 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5457 unsigned WidestRegister = 5458 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5459 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5460 5461 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5462 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5463 5464 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5465 5466 // A lambda that gets the register usage for the given type and VF. 5467 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5468 if (Ty->isTokenTy()) 5469 return 0U; 5470 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5471 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5472 }; 5473 5474 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5475 Instruction *I = IdxToInstr[i]; 5476 5477 // Remove all of the instructions that end at this location. 5478 InstrList &List = TransposeEnds[i]; 5479 for (Instruction *ToRemove : List) 5480 OpenIntervals.erase(ToRemove); 5481 5482 // Ignore instructions that are never used within the loop. 5483 if (Ends.find(I) == Ends.end()) 5484 continue; 5485 5486 // Skip ignored values. 5487 if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) 5488 continue; 5489 5490 // For each VF find the maximum usage of registers. 
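    // (Illustrative: with a 128-bit widest register, an open interval of i32
    // values at VF = 8 counts as max(1, 8 * 32 / 128) = 2 registers of its
    // class via GetRegUsage above; at VF = 1 every open interval counts as a
    // single register.)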
5491 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5492 // Count the number of live intervals. 5493 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5494 5495 if (VFs[j] == 1) { 5496 for (auto Inst : OpenIntervals) { 5497 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5498 if (RegUsage.find(ClassID) == RegUsage.end()) 5499 RegUsage[ClassID] = 1; 5500 else 5501 RegUsage[ClassID] += 1; 5502 } 5503 } else { 5504 collectUniformsAndScalars(VFs[j]); 5505 for (auto Inst : OpenIntervals) { 5506 // Skip ignored values for VF > 1. 5507 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) 5508 continue; 5509 if (isScalarAfterVectorization(Inst, VFs[j])) { 5510 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5511 if (RegUsage.find(ClassID) == RegUsage.end()) 5512 RegUsage[ClassID] = 1; 5513 else 5514 RegUsage[ClassID] += 1; 5515 } else { 5516 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5517 if (RegUsage.find(ClassID) == RegUsage.end()) 5518 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5519 else 5520 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5521 } 5522 } 5523 } 5524 5525 for (auto& pair : RegUsage) { 5526 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5527 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5528 else 5529 MaxUsages[j][pair.first] = pair.second; 5530 } 5531 } 5532 5533 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5534 << OpenIntervals.size() << '\n'); 5535 5536 // Add the current instruction to the list of open intervals. 5537 OpenIntervals.insert(I); 5538 } 5539 5540 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5541 SmallMapVector<unsigned, unsigned, 4> Invariant; 5542 5543 for (auto Inst : LoopInvariants) { 5544 unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5545 unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); 5546 if (Invariant.find(ClassID) == Invariant.end()) 5547 Invariant[ClassID] = Usage; 5548 else 5549 Invariant[ClassID] += Usage; 5550 } 5551 5552 LLVM_DEBUG({ 5553 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5554 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5555 << " item\n"; 5556 for (const auto &pair : MaxUsages[i]) { 5557 dbgs() << "LV(REG): RegisterClass: " 5558 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5559 << " registers\n"; 5560 } 5561 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5562 << " item\n"; 5563 for (const auto &pair : Invariant) { 5564 dbgs() << "LV(REG): RegisterClass: " 5565 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5566 << " registers\n"; 5567 } 5568 }); 5569 5570 RU.LoopInvariantRegs = Invariant; 5571 RU.MaxLocalUsers = MaxUsages[i]; 5572 RUs[i] = RU; 5573 } 5574 5575 return RUs; 5576 } 5577 5578 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5579 // TODO: Cost model for emulated masked load/store is completely 5580 // broken. This hack guides the cost model to use an artificially 5581 // high enough value to practically disable vectorization with such 5582 // operations, except where previously deployed legality hack allowed 5583 // using very low cost values. This is to avoid regressions coming simply 5584 // from moving "masked load/store" check from legality to cost model. 5585 // Masked Load/Gather emulation was previously never allowed. 5586 // Limited number of Masked Store/Scatter emulation was allowed. 
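  // Concretely, the check below flags every emulated masked load, and flags
  // emulated masked stores only once the number of predicated stores in the
  // loop exceeds NumberOfStoresToPredicate.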
  assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}

void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
        if (!useEmulatedMaskMemRefHack(&I) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }
  }
}

int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
    unsigned VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  int Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
5660 // 5661 // We assume we will only emit a value for lane zero of an instruction 5662 // marked uniform after vectorization, rather than VF identical values. 5663 // Thus, if we scalarize an instruction that uses a uniform, we would 5664 // create uses of values corresponding to the lanes we aren't emitting code 5665 // for. This behavior can be changed by allowing getScalarValue to clone 5666 // the lane zero values for uniforms rather than asserting. 5667 for (Use &U : I->operands()) 5668 if (auto *J = dyn_cast<Instruction>(U.get())) 5669 if (isUniformAfterVectorization(J, VF)) 5670 return false; 5671 5672 // Otherwise, we can scalarize the instruction. 5673 return true; 5674 }; 5675 5676 // Compute the expected cost discount from scalarizing the entire expression 5677 // feeding the predicated instruction. We currently only consider expressions 5678 // that are single-use instruction chains. 5679 Worklist.push_back(PredInst); 5680 while (!Worklist.empty()) { 5681 Instruction *I = Worklist.pop_back_val(); 5682 5683 // If we've already analyzed the instruction, there's nothing to do. 5684 if (ScalarCosts.find(I) != ScalarCosts.end()) 5685 continue; 5686 5687 // Compute the cost of the vector instruction. Note that this cost already 5688 // includes the scalarization overhead of the predicated instruction. 5689 unsigned VectorCost = getInstructionCost(I, VF).first; 5690 5691 // Compute the cost of the scalarized instruction. This cost is the cost of 5692 // the instruction as if it wasn't if-converted and instead remained in the 5693 // predicated block. We will scale this cost by block probability after 5694 // computing the scalarization overhead. 5695 unsigned ScalarCost = VF * getInstructionCost(I, 1).first; 5696 5697 // Compute the scalarization overhead of needed insertelement instructions 5698 // and phi nodes. 5699 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5700 ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), 5701 true, false); 5702 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); 5703 } 5704 5705 // Compute the scalarization overhead of needed extractelement 5706 // instructions. For each of the instruction's operands, if the operand can 5707 // be scalarized, add it to the worklist; otherwise, account for the 5708 // overhead. 5709 for (Use &U : I->operands()) 5710 if (auto *J = dyn_cast<Instruction>(U.get())) { 5711 assert(VectorType::isValidElementType(J->getType()) && 5712 "Instruction has non-scalar type"); 5713 if (canBeScalarized(J)) 5714 Worklist.push_back(J); 5715 else if (needsExtract(J, VF)) 5716 ScalarCost += TTI.getScalarizationOverhead( 5717 ToVectorTy(J->getType(),VF), false, true); 5718 } 5719 5720 // Scale the total scalar cost by block probability. 5721 ScalarCost /= getReciprocalPredBlockProb(); 5722 5723 // Compute the discount. A non-negative discount means the vector version 5724 // of the instruction costs more, and scalarizing would be beneficial. 5725 Discount += VectorCost - ScalarCost; 5726 ScalarCosts[I] = ScalarCost; 5727 } 5728 5729 return Discount; 5730 } 5731 5732 LoopVectorizationCostModel::VectorizationCostTy 5733 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5734 VectorizationCostTy Cost; 5735 5736 // For each block. 5737 for (BasicBlock *BB : TheLoop->blocks()) { 5738 VectorizationCostTy BlockCost; 5739 5740 // For each instruction in the old loop. 5741 for (Instruction &I : BB->instructionsWithoutDebug()) { 5742 // Skip ignored values. 
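      // ValuesToIgnore holds ephemeral values, while VecValuesToIgnore holds
      // the casts recorded during reduction and induction detection and is
      // only consulted for VF > 1 (see collectValuesToIgnore).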
5743 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() || 5744 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end())) 5745 continue; 5746 5747 VectorizationCostTy C = getInstructionCost(&I, VF); 5748 5749 // Check if we should override the cost. 5750 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5751 C.first = ForceTargetInstructionCost; 5752 5753 BlockCost.first += C.first; 5754 BlockCost.second |= C.second; 5755 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5756 << " for VF " << VF << " For instruction: " << I 5757 << '\n'); 5758 } 5759 5760 // If we are vectorizing a predicated block, it will have been 5761 // if-converted. This means that the block's instructions (aside from 5762 // stores and instructions that may divide by zero) will now be 5763 // unconditionally executed. For the scalar case, we may not always execute 5764 // the predicated block. Thus, scale the block's cost by the probability of 5765 // executing it. 5766 if (VF == 1 && blockNeedsPredication(BB)) 5767 BlockCost.first /= getReciprocalPredBlockProb(); 5768 5769 Cost.first += BlockCost.first; 5770 Cost.second |= BlockCost.second; 5771 } 5772 5773 return Cost; 5774 } 5775 5776 /// Gets Address Access SCEV after verifying that the access pattern 5777 /// is loop invariant except the induction variable dependence. 5778 /// 5779 /// This SCEV can be sent to the Target in order to estimate the address 5780 /// calculation cost. 5781 static const SCEV *getAddressAccessSCEV( 5782 Value *Ptr, 5783 LoopVectorizationLegality *Legal, 5784 PredicatedScalarEvolution &PSE, 5785 const Loop *TheLoop) { 5786 5787 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5788 if (!Gep) 5789 return nullptr; 5790 5791 // We are looking for a gep with all loop invariant indices except for one 5792 // which should be an induction variable. 5793 auto SE = PSE.getSE(); 5794 unsigned NumOperands = Gep->getNumOperands(); 5795 for (unsigned i = 1; i < NumOperands; ++i) { 5796 Value *Opd = Gep->getOperand(i); 5797 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5798 !Legal->isInductionVariable(Opd)) 5799 return nullptr; 5800 } 5801 5802 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 5803 return PSE.getSCEV(Ptr); 5804 } 5805 5806 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 5807 return Legal->hasStride(I->getOperand(0)) || 5808 Legal->hasStride(I->getOperand(1)); 5809 } 5810 5811 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5812 unsigned VF) { 5813 assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); 5814 Type *ValTy = getMemInstValueType(I); 5815 auto SE = PSE.getSE(); 5816 5817 unsigned AS = getLoadStoreAddressSpace(I); 5818 Value *Ptr = getLoadStorePointerOperand(I); 5819 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5820 5821 // Figure out whether the access is strided and get the stride value 5822 // if it's known in compile time 5823 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5824 5825 // Get the cost of the scalar memory instruction and address computation. 5826 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5827 5828 // Don't pass *I here, since it is scalar but will actually be part of a 5829 // vectorized loop where the user of it is a vectorized instruction. 
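  // The total accumulated here is VF * (address computation + scalar memory
  // op) plus the insert/extract overhead of scalarization, scaled further
  // below by the probability of executing the predicated block when the
  // access is predicated.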
5830 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5831 Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 5832 Alignment, AS); 5833 5834 // Get the overhead of the extractelement and insertelement instructions 5835 // we might create due to scalarization. 5836 Cost += getScalarizationOverhead(I, VF); 5837 5838 // If we have a predicated store, it may not be executed for each vector 5839 // lane. Scale the cost by the probability of executing the predicated 5840 // block. 5841 if (isPredicatedInst(I)) { 5842 Cost /= getReciprocalPredBlockProb(); 5843 5844 if (useEmulatedMaskMemRefHack(I)) 5845 // Artificially setting to a high enough value to practically disable 5846 // vectorization with such operations. 5847 Cost = 3000000; 5848 } 5849 5850 return Cost; 5851 } 5852 5853 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5854 unsigned VF) { 5855 Type *ValTy = getMemInstValueType(I); 5856 Type *VectorTy = ToVectorTy(ValTy, VF); 5857 Value *Ptr = getLoadStorePointerOperand(I); 5858 unsigned AS = getLoadStoreAddressSpace(I); 5859 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 5860 5861 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5862 "Stride should be 1 or -1 for consecutive memory access"); 5863 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5864 unsigned Cost = 0; 5865 if (Legal->isMaskRequired(I)) 5866 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, 5867 Alignment ? Alignment->value() : 0, AS); 5868 else 5869 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); 5870 5871 bool Reverse = ConsecutiveStride < 0; 5872 if (Reverse) 5873 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5874 return Cost; 5875 } 5876 5877 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5878 unsigned VF) { 5879 Type *ValTy = getMemInstValueType(I); 5880 Type *VectorTy = ToVectorTy(ValTy, VF); 5881 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5882 unsigned AS = getLoadStoreAddressSpace(I); 5883 if (isa<LoadInst>(I)) { 5884 return TTI.getAddressComputationCost(ValTy) + 5885 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + 5886 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 5887 } 5888 StoreInst *SI = cast<StoreInst>(I); 5889 5890 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 5891 return TTI.getAddressComputationCost(ValTy) + 5892 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + 5893 (isLoopInvariantStoreValue 5894 ? 0 5895 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 5896 VF - 1)); 5897 } 5898 5899 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 5900 unsigned VF) { 5901 Type *ValTy = getMemInstValueType(I); 5902 Type *VectorTy = ToVectorTy(ValTy, VF); 5903 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5904 Value *Ptr = getLoadStorePointerOperand(I); 5905 5906 return TTI.getAddressComputationCost(VectorTy) + 5907 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, 5908 Legal->isMaskRequired(I), 5909 Alignment ? 
Alignment->value() : 0, I); 5910 } 5911 5912 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 5913 unsigned VF) { 5914 Type *ValTy = getMemInstValueType(I); 5915 Type *VectorTy = ToVectorTy(ValTy, VF); 5916 unsigned AS = getLoadStoreAddressSpace(I); 5917 5918 auto Group = getInterleavedAccessGroup(I); 5919 assert(Group && "Fail to get an interleaved access group."); 5920 5921 unsigned InterleaveFactor = Group->getFactor(); 5922 Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 5923 5924 // Holds the indices of existing members in an interleaved load group. 5925 // An interleaved store group doesn't need this as it doesn't allow gaps. 5926 SmallVector<unsigned, 4> Indices; 5927 if (isa<LoadInst>(I)) { 5928 for (unsigned i = 0; i < InterleaveFactor; i++) 5929 if (Group->getMember(i)) 5930 Indices.push_back(i); 5931 } 5932 5933 // Calculate the cost of the whole interleaved group. 5934 bool UseMaskForGaps = 5935 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5936 unsigned Cost = TTI.getInterleavedMemoryOpCost( 5937 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5938 Group->getAlign().value(), AS, Legal->isMaskRequired(I), UseMaskForGaps); 5939 5940 if (Group->isReverse()) { 5941 // TODO: Add support for reversed masked interleaved access. 5942 assert(!Legal->isMaskRequired(I) && 5943 "Reverse masked interleaved access not supported."); 5944 Cost += Group->getNumMembers() * 5945 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5946 } 5947 return Cost; 5948 } 5949 5950 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 5951 unsigned VF) { 5952 // Calculate scalar cost only. Vectorization cost should be ready at this 5953 // moment. 5954 if (VF == 1) { 5955 Type *ValTy = getMemInstValueType(I); 5956 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5957 unsigned AS = getLoadStoreAddressSpace(I); 5958 5959 return TTI.getAddressComputationCost(ValTy) + 5960 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); 5961 } 5962 return getWideningCost(I, VF); 5963 } 5964 5965 LoopVectorizationCostModel::VectorizationCostTy 5966 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 5967 // If we know that this instruction will remain uniform, check the cost of 5968 // the scalar version. 5969 if (isUniformAfterVectorization(I, VF)) 5970 VF = 1; 5971 5972 if (VF > 1 && isProfitableToScalarize(I, VF)) 5973 return VectorizationCostTy(InstsToScalarize[VF][I], false); 5974 5975 // Forced scalars do not have any scalarization overhead. 
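  // Their cost is simply VF copies of the scalar instruction cost, as
  // computed below.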
5976 auto ForcedScalar = ForcedScalars.find(VF); 5977 if (VF > 1 && ForcedScalar != ForcedScalars.end()) { 5978 auto InstSet = ForcedScalar->second; 5979 if (InstSet.find(I) != InstSet.end()) 5980 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); 5981 } 5982 5983 Type *VectorTy; 5984 unsigned C = getInstructionCost(I, VF, VectorTy); 5985 5986 bool TypeNotScalarized = 5987 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; 5988 return VectorizationCostTy(C, TypeNotScalarized); 5989 } 5990 5991 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 5992 unsigned VF) { 5993 5994 if (VF == 1) 5995 return 0; 5996 5997 unsigned Cost = 0; 5998 Type *RetTy = ToVectorTy(I->getType(), VF); 5999 if (!RetTy->isVoidTy() && 6000 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6001 Cost += TTI.getScalarizationOverhead(RetTy, true, false); 6002 6003 // Some targets keep addresses scalar. 6004 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6005 return Cost; 6006 6007 // Some targets support efficient element stores. 6008 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6009 return Cost; 6010 6011 // Collect operands to consider. 6012 CallInst *CI = dyn_cast<CallInst>(I); 6013 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6014 6015 // Skip operands that do not require extraction/scalarization and do not incur 6016 // any overhead. 6017 return Cost + TTI.getOperandsScalarizationOverhead( 6018 filterExtractingOperands(Ops, VF), VF); 6019 } 6020 6021 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { 6022 if (VF == 1) 6023 return; 6024 NumPredStores = 0; 6025 for (BasicBlock *BB : TheLoop->blocks()) { 6026 // For each instruction in the old loop. 6027 for (Instruction &I : *BB) { 6028 Value *Ptr = getLoadStorePointerOperand(&I); 6029 if (!Ptr) 6030 continue; 6031 6032 // TODO: We should generate better code and update the cost model for 6033 // predicated uniform stores. Today they are treated as any other 6034 // predicated store (see added test cases in 6035 // invariant-store-vectorization.ll). 6036 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6037 NumPredStores++; 6038 6039 if (Legal->isUniform(Ptr) && 6040 // Conditional loads and stores should be scalarized and predicated. 6041 // isScalarWithPredication cannot be used here since masked 6042 // gather/scatters are not considered scalar with predication. 6043 !Legal->blockNeedsPredication(I.getParent())) { 6044 // TODO: Avoid replicating loads and stores instead of 6045 // relying on instcombine to remove them. 6046 // Load: Scalar load + broadcast 6047 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6048 unsigned Cost = getUniformMemOpCost(&I, VF); 6049 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6050 continue; 6051 } 6052 6053 // We assume that widening is the best solution when possible. 6054 if (memoryInstructionCanBeWidened(&I, VF)) { 6055 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6056 int ConsecutiveStride = 6057 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6058 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6059 "Expected consecutive stride."); 6060 InstWidening Decision = 6061 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6062 setWideningDecision(&I, VF, Decision, Cost); 6063 continue; 6064 } 6065 6066 // Choose between Interleaving, Gather/Scatter or Scalarization. 
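      // The cheapest of the three costs computed below wins, with interleaving
      // preferred on a tie with gather/scatter. For example, hypothetical
      // costs of 8 (interleave), 12 (gather/scatter) and 20 (scalarization)
      // would select CM_Interleave.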
6067 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6068 unsigned NumAccesses = 1; 6069 if (isAccessInterleaved(&I)) { 6070 auto Group = getInterleavedAccessGroup(&I); 6071 assert(Group && "Fail to get an interleaved access group."); 6072 6073 // Make one decision for the whole group. 6074 if (getWideningDecision(&I, VF) != CM_Unknown) 6075 continue; 6076 6077 NumAccesses = Group->getNumMembers(); 6078 if (interleavedAccessCanBeWidened(&I, VF)) 6079 InterleaveCost = getInterleaveGroupCost(&I, VF); 6080 } 6081 6082 unsigned GatherScatterCost = 6083 isLegalGatherOrScatter(&I) 6084 ? getGatherScatterCost(&I, VF) * NumAccesses 6085 : std::numeric_limits<unsigned>::max(); 6086 6087 unsigned ScalarizationCost = 6088 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6089 6090 // Choose better solution for the current VF, 6091 // write down this decision and use it during vectorization. 6092 unsigned Cost; 6093 InstWidening Decision; 6094 if (InterleaveCost <= GatherScatterCost && 6095 InterleaveCost < ScalarizationCost) { 6096 Decision = CM_Interleave; 6097 Cost = InterleaveCost; 6098 } else if (GatherScatterCost < ScalarizationCost) { 6099 Decision = CM_GatherScatter; 6100 Cost = GatherScatterCost; 6101 } else { 6102 Decision = CM_Scalarize; 6103 Cost = ScalarizationCost; 6104 } 6105 // If the instructions belongs to an interleave group, the whole group 6106 // receives the same decision. The whole group receives the cost, but 6107 // the cost will actually be assigned to one instruction. 6108 if (auto Group = getInterleavedAccessGroup(&I)) 6109 setWideningDecision(Group, VF, Decision, Cost); 6110 else 6111 setWideningDecision(&I, VF, Decision, Cost); 6112 } 6113 } 6114 6115 // Make sure that any load of address and any other address computation 6116 // remains scalar unless there is gather/scatter support. This avoids 6117 // inevitable extracts into address registers, and also has the benefit of 6118 // activating LSR more, since that pass can't optimize vectorized 6119 // addresses. 6120 if (TTI.prefersVectorizedAddressing()) 6121 return; 6122 6123 // Start with all scalar pointer uses. 6124 SmallPtrSet<Instruction *, 8> AddrDefs; 6125 for (BasicBlock *BB : TheLoop->blocks()) 6126 for (Instruction &I : *BB) { 6127 Instruction *PtrDef = 6128 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6129 if (PtrDef && TheLoop->contains(PtrDef) && 6130 getWideningDecision(&I, VF) != CM_GatherScatter) 6131 AddrDefs.insert(PtrDef); 6132 } 6133 6134 // Add all instructions used to generate the addresses. 6135 SmallVector<Instruction *, 4> Worklist; 6136 for (auto *I : AddrDefs) 6137 Worklist.push_back(I); 6138 while (!Worklist.empty()) { 6139 Instruction *I = Worklist.pop_back_val(); 6140 for (auto &Op : I->operands()) 6141 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6142 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6143 AddrDefs.insert(InstOp).second) 6144 Worklist.push_back(InstOp); 6145 } 6146 6147 for (auto *I : AddrDefs) { 6148 if (isa<LoadInst>(I)) { 6149 // Setting the desired widening decision should ideally be handled in 6150 // by cost functions, but since this involves the task of finding out 6151 // if the loaded register is involved in an address computation, it is 6152 // instead changed here when we know this is the case. 6153 InstWidening Decision = getWideningDecision(I, VF); 6154 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6155 // Scalarize a widened load of address. 
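        // The scalarized cost below is modeled as VF copies of the scalar
        // memory instruction cost.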
6156 setWideningDecision(I, VF, CM_Scalarize, 6157 (VF * getMemoryInstructionCost(I, 1))); 6158 else if (auto Group = getInterleavedAccessGroup(I)) { 6159 // Scalarize an interleave group of address loads. 6160 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6161 if (Instruction *Member = Group->getMember(I)) 6162 setWideningDecision(Member, VF, CM_Scalarize, 6163 (VF * getMemoryInstructionCost(Member, 1))); 6164 } 6165 } 6166 } else 6167 // Make sure I gets scalarized and a cost estimate without 6168 // scalarization overhead. 6169 ForcedScalars[VF].insert(I); 6170 } 6171 } 6172 6173 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6174 unsigned VF, 6175 Type *&VectorTy) { 6176 Type *RetTy = I->getType(); 6177 if (canTruncateToMinimalBitwidth(I, VF)) 6178 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6179 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6180 auto SE = PSE.getSE(); 6181 6182 // TODO: We need to estimate the cost of intrinsic calls. 6183 switch (I->getOpcode()) { 6184 case Instruction::GetElementPtr: 6185 // We mark this instruction as zero-cost because the cost of GEPs in 6186 // vectorized code depends on whether the corresponding memory instruction 6187 // is scalarized or not. Therefore, we handle GEPs with the memory 6188 // instruction cost. 6189 return 0; 6190 case Instruction::Br: { 6191 // In cases of scalarized and predicated instructions, there will be VF 6192 // predicated blocks in the vectorized loop. Each branch around these 6193 // blocks requires also an extract of its vector compare i1 element. 6194 bool ScalarPredicatedBB = false; 6195 BranchInst *BI = cast<BranchInst>(I); 6196 if (VF > 1 && BI->isConditional() && 6197 (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) != 6198 PredicatedBBsAfterVectorization.end() || 6199 PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) != 6200 PredicatedBBsAfterVectorization.end())) 6201 ScalarPredicatedBB = true; 6202 6203 if (ScalarPredicatedBB) { 6204 // Return cost for branches around scalarized and predicated blocks. 6205 Type *Vec_i1Ty = 6206 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6207 return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) + 6208 (TTI.getCFInstrCost(Instruction::Br) * VF)); 6209 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) 6210 // The back-edge branch will remain, as will all scalar branches. 6211 return TTI.getCFInstrCost(Instruction::Br); 6212 else 6213 // This branch will be eliminated by if-conversion. 6214 return 0; 6215 // Note: We currently assume zero cost for an unconditional branch inside 6216 // a predicated block since it will become a fall-through, although we 6217 // may decide in the future to call TTI for all branches. 6218 } 6219 case Instruction::PHI: { 6220 auto *Phi = cast<PHINode>(I); 6221 6222 // First-order recurrences are replaced by vector shuffles inside the loop. 6223 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6224 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) 6225 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 6226 VectorTy, VF - 1, VectorType::get(RetTy, 1)); 6227 6228 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6229 // converted into select instructions. We require N - 1 selects per phi 6230 // node, where N is the number of incoming values. 
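    // For example, a phi with three incoming values lowers to two selects, so
    // its cost is twice the cost of one vector select at this VF.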
6231 if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) 6232 return (Phi->getNumIncomingValues() - 1) * 6233 TTI.getCmpSelInstrCost( 6234 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6235 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)); 6236 6237 return TTI.getCFInstrCost(Instruction::PHI); 6238 } 6239 case Instruction::UDiv: 6240 case Instruction::SDiv: 6241 case Instruction::URem: 6242 case Instruction::SRem: 6243 // If we have a predicated instruction, it may not be executed for each 6244 // vector lane. Get the scalarization cost and scale this amount by the 6245 // probability of executing the predicated block. If the instruction is not 6246 // predicated, we fall through to the next case. 6247 if (VF > 1 && isScalarWithPredication(I)) { 6248 unsigned Cost = 0; 6249 6250 // These instructions have a non-void type, so account for the phi nodes 6251 // that we will create. This cost is likely to be zero. The phi node 6252 // cost, if any, should be scaled by the block probability because it 6253 // models a copy at the end of each predicated block. 6254 Cost += VF * TTI.getCFInstrCost(Instruction::PHI); 6255 6256 // The cost of the non-predicated instruction. 6257 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy); 6258 6259 // The cost of insertelement and extractelement instructions needed for 6260 // scalarization. 6261 Cost += getScalarizationOverhead(I, VF); 6262 6263 // Scale the cost by the probability of executing the predicated blocks. 6264 // This assumes the predicated block for each vector lane is equally 6265 // likely. 6266 return Cost / getReciprocalPredBlockProb(); 6267 } 6268 LLVM_FALLTHROUGH; 6269 case Instruction::Add: 6270 case Instruction::FAdd: 6271 case Instruction::Sub: 6272 case Instruction::FSub: 6273 case Instruction::Mul: 6274 case Instruction::FMul: 6275 case Instruction::FDiv: 6276 case Instruction::FRem: 6277 case Instruction::Shl: 6278 case Instruction::LShr: 6279 case Instruction::AShr: 6280 case Instruction::And: 6281 case Instruction::Or: 6282 case Instruction::Xor: { 6283 // Since we will replace the stride by 1 the multiplication should go away. 6284 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6285 return 0; 6286 // Certain instructions can be cheaper to vectorize if they have a constant 6287 // second vector operand. One example of this are shifts on x86. 6288 Value *Op2 = I->getOperand(1); 6289 TargetTransformInfo::OperandValueProperties Op2VP; 6290 TargetTransformInfo::OperandValueKind Op2VK = 6291 TTI.getOperandInfo(Op2, Op2VP); 6292 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6293 Op2VK = TargetTransformInfo::OK_UniformValue; 6294 6295 SmallVector<const Value *, 4> Operands(I->operand_values()); 6296 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6297 return N * TTI.getArithmeticInstrCost( 6298 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6299 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6300 } 6301 case Instruction::FNeg: { 6302 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6303 return N * TTI.getArithmeticInstrCost( 6304 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6305 TargetTransformInfo::OK_AnyValue, 6306 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6307 I->getOperand(0), I); 6308 } 6309 case Instruction::Select: { 6310 SelectInst *SI = cast<SelectInst>(I); 6311 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6312 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6313 Type *CondTy = SI->getCondition()->getType(); 6314 if (!ScalarCond) 6315 CondTy = VectorType::get(CondTy, VF); 6316 6317 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I); 6318 } 6319 case Instruction::ICmp: 6320 case Instruction::FCmp: { 6321 Type *ValTy = I->getOperand(0)->getType(); 6322 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6323 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6324 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6325 VectorTy = ToVectorTy(ValTy, VF); 6326 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I); 6327 } 6328 case Instruction::Store: 6329 case Instruction::Load: { 6330 unsigned Width = VF; 6331 if (Width > 1) { 6332 InstWidening Decision = getWideningDecision(I, Width); 6333 assert(Decision != CM_Unknown && 6334 "CM decision should be taken at this point"); 6335 if (Decision == CM_Scalarize) 6336 Width = 1; 6337 } 6338 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6339 return getMemoryInstructionCost(I, VF); 6340 } 6341 case Instruction::ZExt: 6342 case Instruction::SExt: 6343 case Instruction::FPToUI: 6344 case Instruction::FPToSI: 6345 case Instruction::FPExt: 6346 case Instruction::PtrToInt: 6347 case Instruction::IntToPtr: 6348 case Instruction::SIToFP: 6349 case Instruction::UIToFP: 6350 case Instruction::Trunc: 6351 case Instruction::FPTrunc: 6352 case Instruction::BitCast: { 6353 // We optimize the truncation of induction variables having constant 6354 // integer steps. The cost of these truncations is the same as the scalar 6355 // operation. 6356 if (isOptimizableIVTruncate(I, VF)) { 6357 auto *Trunc = cast<TruncInst>(I); 6358 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6359 Trunc->getSrcTy(), Trunc); 6360 } 6361 6362 Type *SrcScalarTy = I->getOperand(0)->getType(); 6363 Type *SrcVecTy = 6364 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6365 if (canTruncateToMinimalBitwidth(I, VF)) { 6366 // This cast is going to be shrunk. This may remove the cast or it might 6367 // turn it into slightly different cast. For example, if MinBW == 16, 6368 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6369 // 6370 // Calculate the modified src and dest types. 6371 Type *MinVecTy = VectorTy; 6372 if (I->getOpcode() == Instruction::Trunc) { 6373 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6374 VectorTy = 6375 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6376 } else if (I->getOpcode() == Instruction::ZExt || 6377 I->getOpcode() == Instruction::SExt) { 6378 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6379 VectorTy = 6380 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6381 } 6382 } 6383 6384 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6385 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); 6386 } 6387 case Instruction::Call: { 6388 bool NeedToScalarize; 6389 CallInst *CI = cast<CallInst>(I); 6390 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6391 if (getVectorIntrinsicIDForCall(CI, TLI)) 6392 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6393 return CallCost; 6394 } 6395 default: 6396 // The cost of executing VF copies of the scalar instruction. This opcode 6397 // is unknown. Assume that it is the same as 'mul'. 6398 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + 6399 getScalarizationOverhead(I, VF); 6400 } // end of switch. 6401 } 6402 6403 char LoopVectorize::ID = 0; 6404 6405 static const char lv_name[] = "Loop Vectorization"; 6406 6407 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6408 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6409 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6410 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6411 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6412 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6413 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6414 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6415 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6416 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6417 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6418 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6419 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6420 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6421 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6422 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6423 6424 namespace llvm { 6425 6426 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6427 6428 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6429 bool VectorizeOnlyWhenForced) { 6430 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6431 } 6432 6433 } // end namespace llvm 6434 6435 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6436 // Check if the pointer operand of a load or store instruction is 6437 // consecutive. 6438 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6439 return Legal->isConsecutivePtr(Ptr); 6440 return false; 6441 } 6442 6443 void LoopVectorizationCostModel::collectValuesToIgnore() { 6444 // Ignore ephemeral values. 6445 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6446 6447 // Ignore type-promoting instructions we identified during reduction 6448 // detection. 6449 for (auto &Reduction : Legal->getReductionVars()) { 6450 RecurrenceDescriptor &RedDes = Reduction.second; 6451 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6452 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6453 } 6454 // Ignore type-casting instructions we identified during induction 6455 // detection. 6456 for (auto &Induction : Legal->getInductionVars()) { 6457 InductionDescriptor &IndDes = Induction.second; 6458 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6459 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6460 } 6461 } 6462 6463 // TODO: we could return a pair of values that specify the max VF and 6464 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6465 // `buildVPlans(VF, VF)`. 
We cannot do it because VPLAN at the moment 6466 // doesn't have a cost model that can choose which plan to execute if 6467 // more than one is generated. 6468 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 6469 LoopVectorizationCostModel &CM) { 6470 unsigned WidestType; 6471 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 6472 return WidestVectorRegBits / WidestType; 6473 } 6474 6475 VectorizationFactor 6476 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { 6477 unsigned VF = UserVF; 6478 // Outer loop handling: They may require CFG and instruction level 6479 // transformations before even evaluating whether vectorization is profitable. 6480 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 6481 // the vectorization pipeline. 6482 if (!OrigLoop->empty()) { 6483 // If the user doesn't provide a vectorization factor, determine a 6484 // reasonable one. 6485 if (!UserVF) { 6486 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM); 6487 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 6488 6489 // Make sure we have a VF > 1 for stress testing. 6490 if (VPlanBuildStressTest && VF < 2) { 6491 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 6492 << "overriding computed VF.\n"); 6493 VF = 4; 6494 } 6495 } 6496 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 6497 assert(isPowerOf2_32(VF) && "VF needs to be a power of two"); 6498 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF 6499 << " to build VPlans.\n"); 6500 buildVPlans(VF, VF); 6501 6502 // For VPlan build stress testing, we bail out after VPlan construction. 6503 if (VPlanBuildStressTest) 6504 return VectorizationFactor::Disabled(); 6505 6506 return {VF, 0}; 6507 } 6508 6509 LLVM_DEBUG( 6510 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 6511 "VPlan-native path.\n"); 6512 return VectorizationFactor::Disabled(); 6513 } 6514 6515 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) { 6516 assert(OrigLoop->empty() && "Inner loop expected."); 6517 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(); 6518 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 6519 return None; 6520 6521 // Invalidate interleave groups if all blocks of loop will be predicated. 6522 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 6523 !useMaskedInterleavedAccesses(*TTI)) { 6524 LLVM_DEBUG( 6525 dbgs() 6526 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 6527 "which requires masked-interleaved support.\n"); 6528 CM.InterleaveInfo.reset(); 6529 } 6530 6531 if (UserVF) { 6532 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 6533 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); 6534 // Collect the instructions (and their associated costs) that will be more 6535 // profitable to scalarize. 6536 CM.selectUserVectorizationFactor(UserVF); 6537 buildVPlansWithVPRecipes(UserVF, UserVF); 6538 LLVM_DEBUG(printPlans(dbgs())); 6539 return {{UserVF, 0}}; 6540 } 6541 6542 unsigned MaxVF = MaybeMaxVF.getValue(); 6543 assert(MaxVF != 0 && "MaxVF is zero."); 6544 6545 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 6546 // Collect Uniform and Scalar instructions after vectorization with VF. 6547 CM.collectUniformsAndScalars(VF); 6548 6549 // Collect the instructions (and their associated costs) that will be more 6550 // profitable to scalarize. 
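    // Scalarization analysis only applies to vector VFs, so VF == 1 is
    // skipped below.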
6551 if (VF > 1) 6552 CM.collectInstsToScalarize(VF); 6553 } 6554 6555 buildVPlansWithVPRecipes(1, MaxVF); 6556 LLVM_DEBUG(printPlans(dbgs())); 6557 if (MaxVF == 1) 6558 return VectorizationFactor::Disabled(); 6559 6560 // Select the optimal vectorization factor. 6561 return CM.selectVectorizationFactor(MaxVF); 6562 } 6563 6564 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { 6565 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 6566 << '\n'); 6567 BestVF = VF; 6568 BestUF = UF; 6569 6570 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 6571 return !Plan->hasVF(VF); 6572 }); 6573 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 6574 } 6575 6576 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 6577 DominatorTree *DT) { 6578 // Perform the actual loop transformation. 6579 6580 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 6581 VPCallbackILV CallbackILV(ILV); 6582 6583 VPTransformState State{BestVF, BestUF, LI, 6584 DT, ILV.Builder, ILV.VectorLoopValueMap, 6585 &ILV, CallbackILV}; 6586 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 6587 State.TripCount = ILV.getOrCreateTripCount(nullptr); 6588 State.CanonicalIV = ILV.Induction; 6589 6590 //===------------------------------------------------===// 6591 // 6592 // Notice: any optimization or new instruction that go 6593 // into the code below should also be implemented in 6594 // the cost-model. 6595 // 6596 //===------------------------------------------------===// 6597 6598 // 2. Copy and widen instructions from the old loop into the new loop. 6599 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 6600 VPlans.front()->execute(&State); 6601 6602 // 3. Fix the vectorized code: take care of header phi's, live-outs, 6603 // predication, updating analyses. 6604 ILV.fixVectorizedLoop(); 6605 } 6606 6607 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 6608 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6609 BasicBlock *Latch = OrigLoop->getLoopLatch(); 6610 6611 // We create new control-flow for the vectorized loop, so the original 6612 // condition will be dead after vectorization if it's only used by the 6613 // branch. 6614 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 6615 if (Cmp && Cmp->hasOneUse()) 6616 DeadInstructions.insert(Cmp); 6617 6618 // We create new "steps" for induction variable updates to which the original 6619 // induction variables map. An original update instruction will be dead if 6620 // all its users except the induction variable are dead. 6621 for (auto &Induction : Legal->getInductionVars()) { 6622 PHINode *Ind = Induction.first; 6623 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 6624 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 6625 return U == Ind || DeadInstructions.find(cast<Instruction>(U)) != 6626 DeadInstructions.end(); 6627 })) 6628 DeadInstructions.insert(IndUpdate); 6629 6630 // We record as "Dead" also the type-casting instructions we had identified 6631 // during induction analysis. We don't need any handling for them in the 6632 // vectorized loop because we have proven that, under a proper runtime 6633 // test guarding the vectorized loop, the value of the phi, and the casted 6634 // value of the phi, are the same. The last instruction in this casting chain 6635 // will get its scalar/vector/widened def from the scalar/vector/widened def 6636 // of the respective phi node. 
Any other casts in the induction def-use chain 6637 // have no other uses outside the phi update chain, and will be ignored. 6638 InductionDescriptor &IndDes = Induction.second; 6639 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6640 DeadInstructions.insert(Casts.begin(), Casts.end()); 6641 } 6642 } 6643 6644 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6645 6646 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6647 6648 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6649 Instruction::BinaryOps BinOp) { 6650 // When unrolling and the VF is 1, we only need to add a simple scalar. 6651 Type *Ty = Val->getType(); 6652 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6653 6654 if (Ty->isFloatingPointTy()) { 6655 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6656 6657 // Floating point operations had to be 'fast' to enable the unrolling. 6658 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6659 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6660 } 6661 Constant *C = ConstantInt::get(Ty, StartIdx); 6662 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6663 } 6664 6665 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6666 SmallVector<Metadata *, 4> MDs; 6667 // Reserve first location for self reference to the LoopID metadata node. 6668 MDs.push_back(nullptr); 6669 bool IsUnrollMetadata = false; 6670 MDNode *LoopID = L->getLoopID(); 6671 if (LoopID) { 6672 // First find existing loop unrolling disable metadata. 6673 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6674 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6675 if (MD) { 6676 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6677 IsUnrollMetadata = 6678 S && S->getString().startswith("llvm.loop.unroll.disable"); 6679 } 6680 MDs.push_back(LoopID->getOperand(i)); 6681 } 6682 } 6683 6684 if (!IsUnrollMetadata) { 6685 // Add runtime unroll disable metadata. 6686 LLVMContext &Context = L->getHeader()->getContext(); 6687 SmallVector<Metadata *, 1> DisableOperands; 6688 DisableOperands.push_back( 6689 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6690 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6691 MDs.push_back(DisableNode); 6692 MDNode *NewLoopID = MDNode::get(Context, MDs); 6693 // Set operand 0 to refer to the loop id itself. 6694 NewLoopID->replaceOperandWith(0, NewLoopID); 6695 L->setLoopID(NewLoopID); 6696 } 6697 } 6698 6699 bool LoopVectorizationPlanner::getDecisionAndClampRange( 6700 const std::function<bool(unsigned)> &Predicate, VFRange &Range) { 6701 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 6702 bool PredicateAtRangeStart = Predicate(Range.Start); 6703 6704 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 6705 if (Predicate(TmpVF) != PredicateAtRangeStart) { 6706 Range.End = TmpVF; 6707 break; 6708 } 6709 6710 return PredicateAtRangeStart; 6711 } 6712 6713 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 6714 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 6715 /// of VF's starting at a given VF and extending it as much as possible. Each 6716 /// vectorization decision can potentially shorten this sub-range during 6717 /// buildVPlan(). 
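/// For example, with \p MinVF = 1 and \p MaxVF = 8, a sub-range starting at
/// VF = 1 might be clamped by some decision so that its VPlan covers only
/// {1, 2}; the next iteration then starts a new sub-range at VF = 4 (the VF
/// values here are purely illustrative).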
6718 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 6719 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6720 VFRange SubRange = {VF, MaxVF + 1}; 6721 VPlans.push_back(buildVPlan(SubRange)); 6722 VF = SubRange.End; 6723 } 6724 } 6725 6726 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 6727 VPlanPtr &Plan) { 6728 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 6729 6730 // Look for cached value. 6731 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 6732 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 6733 if (ECEntryIt != EdgeMaskCache.end()) 6734 return ECEntryIt->second; 6735 6736 VPValue *SrcMask = createBlockInMask(Src, Plan); 6737 6738 // The terminator has to be a branch inst! 6739 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 6740 assert(BI && "Unexpected terminator found"); 6741 6742 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 6743 return EdgeMaskCache[Edge] = SrcMask; 6744 6745 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 6746 assert(EdgeMask && "No Edge Mask found for condition"); 6747 6748 if (BI->getSuccessor(0) != Dst) 6749 EdgeMask = Builder.createNot(EdgeMask); 6750 6751 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 6752 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 6753 6754 return EdgeMaskCache[Edge] = EdgeMask; 6755 } 6756 6757 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 6758 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 6759 6760 // Look for cached value. 6761 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 6762 if (BCEntryIt != BlockMaskCache.end()) 6763 return BCEntryIt->second; 6764 6765 // All-one mask is modelled as no-mask following the convention for masked 6766 // load/store/gather/scatter. Initialize BlockMask to no-mask. 6767 VPValue *BlockMask = nullptr; 6768 6769 if (OrigLoop->getHeader() == BB) { 6770 if (!CM.blockNeedsPredication(BB)) 6771 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 6772 6773 // Introduce the early-exit compare IV <= BTC to form header block mask. 6774 // This is used instead of IV < TC because TC may wrap, unlike BTC. 6775 // Start by constructing the desired canonical IV. 6776 VPValue *IV = nullptr; 6777 if (Legal->getPrimaryInduction()) 6778 IV = Plan->getVPValue(Legal->getPrimaryInduction()); 6779 else { 6780 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 6781 Builder.getInsertBlock()->appendRecipe(IVRecipe); 6782 IV = IVRecipe->getVPValue(); 6783 } 6784 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 6785 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 6786 return BlockMaskCache[BB] = BlockMask; 6787 } 6788 6789 // This is the block mask. We OR all incoming edges. 6790 for (auto *Predecessor : predecessors(BB)) { 6791 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 6792 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 6793 return BlockMaskCache[BB] = EdgeMask; 6794 6795 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
6796 BlockMask = EdgeMask; 6797 continue; 6798 } 6799 6800 BlockMask = Builder.createOr(BlockMask, EdgeMask); 6801 } 6802 6803 return BlockMaskCache[BB] = BlockMask; 6804 } 6805 6806 VPWidenMemoryInstructionRecipe * 6807 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 6808 VPlanPtr &Plan) { 6809 if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) 6810 return nullptr; 6811 6812 auto willWiden = [&](unsigned VF) -> bool { 6813 if (VF == 1) 6814 return false; 6815 LoopVectorizationCostModel::InstWidening Decision = 6816 CM.getWideningDecision(I, VF); 6817 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 6818 "CM decision should be taken at this point."); 6819 if (Decision == LoopVectorizationCostModel::CM_Interleave) 6820 return true; 6821 if (CM.isScalarAfterVectorization(I, VF) || 6822 CM.isProfitableToScalarize(I, VF)) 6823 return false; 6824 return Decision != LoopVectorizationCostModel::CM_Scalarize; 6825 }; 6826 6827 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6828 return nullptr; 6829 6830 VPValue *Mask = nullptr; 6831 if (Legal->isMaskRequired(I)) 6832 Mask = createBlockInMask(I->getParent(), Plan); 6833 6834 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 6835 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 6836 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 6837 6838 StoreInst *Store = cast<StoreInst>(I); 6839 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 6840 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 6841 } 6842 6843 VPWidenIntOrFpInductionRecipe * 6844 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) { 6845 if (PHINode *Phi = dyn_cast<PHINode>(I)) { 6846 // Check if this is an integer or fp induction. If so, build the recipe that 6847 // produces its scalar and vector values. 6848 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 6849 if (II.getKind() == InductionDescriptor::IK_IntInduction || 6850 II.getKind() == InductionDescriptor::IK_FpInduction) 6851 return new VPWidenIntOrFpInductionRecipe(Phi); 6852 6853 return nullptr; 6854 } 6855 6856 // Optimize the special case where the source is a constant integer 6857 // induction variable. Notice that we can only optimize the 'trunc' case 6858 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 6859 // (c) other casts depend on pointer size. 6860 6861 // Determine whether \p K is a truncation based on an induction variable that 6862 // can be optimized. 6863 auto isOptimizableIVTruncate = 6864 [&](Instruction *K) -> std::function<bool(unsigned)> { 6865 return 6866 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; 6867 }; 6868 6869 if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange( 6870 isOptimizableIVTruncate(I), Range)) 6871 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 6872 cast<TruncInst>(I)); 6873 return nullptr; 6874 } 6875 6876 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) { 6877 PHINode *Phi = dyn_cast<PHINode>(I); 6878 if (!Phi || Phi->getParent() == OrigLoop->getHeader()) 6879 return nullptr; 6880 6881 // We know that all PHIs in non-header blocks are converted into selects, so 6882 // we don't have to worry about the insertion order and we can just use the 6883 // builder. At this point we generate the predication tree. 
There may be 6884 // duplications since this is a simple recursive scan, but future 6885 // optimizations will clean it up. 6886 6887 SmallVector<VPValue *, 2> Operands; 6888 unsigned NumIncoming = Phi->getNumIncomingValues(); 6889 for (unsigned In = 0; In < NumIncoming; In++) { 6890 VPValue *EdgeMask = 6891 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 6892 assert((EdgeMask || NumIncoming == 1) && 6893 "Multiple predecessors with one having a full mask"); 6894 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 6895 if (EdgeMask) 6896 Operands.push_back(EdgeMask); 6897 } 6898 return new VPBlendRecipe(Phi, Operands); 6899 } 6900 6901 VPWidenCallRecipe * 6902 VPRecipeBuilder::tryToWidenCall(Instruction *I, VFRange &Range, VPlan &Plan) { 6903 6904 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6905 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6906 6907 CallInst *CI = dyn_cast<CallInst>(I); 6908 if (IsPredicated || !CI) 6909 return nullptr; 6910 6911 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6912 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 6913 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 6914 return nullptr; 6915 6916 auto willWiden = [&](unsigned VF) -> bool { 6917 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6918 // The following case may be scalarized depending on the VF. 6919 // The flag shows whether we use Intrinsic or a usual Call for vectorized 6920 // version of the instruction. 6921 // Is it beneficial to perform intrinsic call compared to lib call? 6922 bool NeedToScalarize = false; 6923 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 6924 bool UseVectorIntrinsic = 6925 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 6926 return UseVectorIntrinsic || !NeedToScalarize; 6927 }; 6928 6929 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6930 return nullptr; 6931 6932 // Success: widen this call. 
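  // Illustrative sketch added for exposition (values and VF are made up): for
  // VF=4 a scalar call such as
  //   %r = call float @llvm.sqrt.f32(float %x)
  // is widened either to the vector intrinsic
  //   %r.vec = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x.vec)
  // or to a vector library function found via TLI, whichever the cost
  // comparison in willWiden above considered cheaper. The call's operands are
  // mapped to their VPValue counterparts below so the recipe can later be
  // executed against the VPlan state.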
6933 auto VPValues = map_range(CI->arg_operands(), [&Plan](Value *Op) { 6934 return Plan.getOrAddVPValue(Op); 6935 }); 6936 6937 return new VPWidenCallRecipe(*CI, VPValues); 6938 } 6939 6940 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VFRange &Range) { 6941 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6942 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6943 6944 if (IsPredicated) 6945 return nullptr; 6946 6947 auto IsVectorizableOpcode = [](unsigned Opcode) { 6948 switch (Opcode) { 6949 case Instruction::Add: 6950 case Instruction::And: 6951 case Instruction::AShr: 6952 case Instruction::BitCast: 6953 case Instruction::Br: 6954 case Instruction::FAdd: 6955 case Instruction::FCmp: 6956 case Instruction::FDiv: 6957 case Instruction::FMul: 6958 case Instruction::FNeg: 6959 case Instruction::FPExt: 6960 case Instruction::FPToSI: 6961 case Instruction::FPToUI: 6962 case Instruction::FPTrunc: 6963 case Instruction::FRem: 6964 case Instruction::FSub: 6965 case Instruction::ICmp: 6966 case Instruction::IntToPtr: 6967 case Instruction::Load: 6968 case Instruction::LShr: 6969 case Instruction::Mul: 6970 case Instruction::Or: 6971 case Instruction::PHI: 6972 case Instruction::PtrToInt: 6973 case Instruction::SDiv: 6974 case Instruction::Select: 6975 case Instruction::SExt: 6976 case Instruction::Shl: 6977 case Instruction::SIToFP: 6978 case Instruction::SRem: 6979 case Instruction::Store: 6980 case Instruction::Sub: 6981 case Instruction::Trunc: 6982 case Instruction::UDiv: 6983 case Instruction::UIToFP: 6984 case Instruction::URem: 6985 case Instruction::Xor: 6986 case Instruction::ZExt: 6987 return true; 6988 } 6989 return false; 6990 }; 6991 6992 if (!IsVectorizableOpcode(I->getOpcode())) 6993 return nullptr; 6994 6995 auto willWiden = [&](unsigned VF) -> bool { 6996 if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) || 6997 CM.isProfitableToScalarize(I, VF))) 6998 return false; 6999 if (isa<LoadInst>(I) || isa<StoreInst>(I)) { 7000 assert(CM.getWideningDecision(I, VF) == 7001 LoopVectorizationCostModel::CM_Scalarize && 7002 "Memory widening decisions should have been taken care by now"); 7003 return false; 7004 } 7005 return true; 7006 }; 7007 7008 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7009 return nullptr; 7010 7011 // Success: widen this instruction. 7012 return new VPWidenRecipe(*I); 7013 } 7014 7015 VPBasicBlock *VPRecipeBuilder::handleReplication( 7016 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 7017 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 7018 VPlanPtr &Plan) { 7019 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 7020 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, 7021 Range); 7022 7023 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7024 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 7025 7026 auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); 7027 setRecipe(I, Recipe); 7028 7029 // Find if I uses a predicated instruction. If so, it will use its scalar 7030 // value. Avoid hoisting the insert-element which packs the scalar value into 7031 // a vector value, as that happens iff all users use the vector value. 
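  // For exposition (illustrative IR, names made up): packing a predicated
  // scalar result %d back into a vector is done with a per-lane sequence like
  //   %vec.d = insertelement <4 x i32> %vec.d.prev, i32 %d, i32 <lane>
  // Clearing AlsoPack in the loop below keeps that packing sequence away from
  // the predicated recipe when a user, like this replicated instruction,
  // consumes the scalar lanes directly.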
7032 for (auto &Op : I->operands()) 7033 if (auto *PredInst = dyn_cast<Instruction>(Op)) 7034 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 7035 PredInst2Recipe[PredInst]->setAlsoPack(false); 7036 7037 // Finalize the recipe for Instr, first if it is not predicated. 7038 if (!IsPredicated) { 7039 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 7040 VPBB->appendRecipe(Recipe); 7041 return VPBB; 7042 } 7043 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7044 assert(VPBB->getSuccessors().empty() && 7045 "VPBB has successors when handling predicated replication."); 7046 // Record predicated instructions for above packing optimizations. 7047 PredInst2Recipe[I] = Recipe; 7048 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7049 VPBlockUtils::insertBlockAfter(Region, VPBB); 7050 auto *RegSucc = new VPBasicBlock(); 7051 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7052 return RegSucc; 7053 } 7054 7055 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7056 VPRecipeBase *PredRecipe, 7057 VPlanPtr &Plan) { 7058 // Instructions marked for predication are replicated and placed under an 7059 // if-then construct to prevent side-effects. 7060 7061 // Generate recipes to compute the block mask for this region. 7062 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 7063 7064 // Build the triangular if-then region. 7065 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 7066 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 7067 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 7068 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 7069 auto *PHIRecipe = 7070 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 7071 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 7072 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 7073 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 7074 7075 // Note: first set Entry as region entry and then connect successors starting 7076 // from it in order, to propagate the "parent" of each VPBasicBlock. 7077 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 7078 VPBlockUtils::connectBlocks(Pred, Exit); 7079 7080 return Region; 7081 } 7082 7083 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, 7084 VPlanPtr &Plan, VPBasicBlock *VPBB) { 7085 VPRecipeBase *Recipe = nullptr; 7086 7087 // First, check for specific widening recipes that deal with calls, memory 7088 // operations, inductions and Phi nodes. 7089 if ((Recipe = tryToWidenCall(Instr, Range, *Plan)) || 7090 (Recipe = tryToWidenMemory(Instr, Range, Plan)) || 7091 (Recipe = tryToOptimizeInduction(Instr, Range)) || 7092 (Recipe = tryToBlend(Instr, Plan)) || 7093 (isa<PHINode>(Instr) && 7094 (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) { 7095 setRecipe(Instr, Recipe); 7096 VPBB->appendRecipe(Recipe); 7097 return true; 7098 } 7099 7100 // Handle GEP widening. 
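  // Illustrative note added for exposition (the IR below is made up): when the
  // cost model keeps the GEP scalar we fall through and let it be replicated;
  // otherwise VPWidenGEPRecipe produces a pointer vector, e.g. for VF=4
  // something like
  //   %vec.gep = getelementptr inbounds float, float* %base, <4 x i64> %vec.iv
  // which can then feed a masked gather or scatter.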
7101   if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
7102     auto Scalarize = [&](unsigned VF) {
7103       return CM.isScalarWithPredication(Instr, VF) ||
7104              CM.isScalarAfterVectorization(Instr, VF) ||
7105              CM.isProfitableToScalarize(Instr, VF);
7106     };
7107     if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
7108       return false;
7109     VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
7110     setRecipe(Instr, Recipe);
7111     VPBB->appendRecipe(Recipe);
7112     return true;
7113   }
7114
7115   // Check if Instr is to be widened by a general VPWidenRecipe, after
7116   // having first checked for specific widening recipes.
7117   if ((Recipe = tryToWiden(Instr, Range))) {
7118     setRecipe(Instr, Recipe);
7119     VPBB->appendRecipe(Recipe);
7120     return true;
7121   }
7122
7123   return false;
7124 }
7125
7126 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7127                                                         unsigned MaxVF) {
7128   assert(OrigLoop->empty() && "Inner loop expected.");
7129
7130   // Collect conditions feeding internal conditional branches; they need to be
7131   // represented in VPlan for it to model masking.
7132   SmallPtrSet<Value *, 1> NeedDef;
7133
7134   auto *Latch = OrigLoop->getLoopLatch();
7135   for (BasicBlock *BB : OrigLoop->blocks()) {
7136     if (BB == Latch)
7137       continue;
7138     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7139     if (Branch && Branch->isConditional())
7140       NeedDef.insert(Branch->getCondition());
7141   }
7142
7143   // If the tail is to be folded by masking, the primary induction variable,
7144   // if it exists, needs to be represented in VPlan to model early-exit masking.
7145   // Also, both the Phi and the live-out instruction of each reduction are
7146   // required in order to introduce a select between them in VPlan.
7147   if (CM.foldTailByMasking()) {
7148     if (Legal->getPrimaryInduction())
7149       NeedDef.insert(Legal->getPrimaryInduction());
7150     for (auto &Reduction : Legal->getReductionVars()) {
7151       NeedDef.insert(Reduction.first);
7152       NeedDef.insert(Reduction.second.getLoopExitInstr());
7153     }
7154   }
7155
7156   // Collect instructions from the original loop that will become trivially dead
7157   // in the vectorized loop. We don't need to vectorize these instructions. For
7158   // example, original induction update instructions can become dead because we
7159   // separately emit induction "steps" when generating code for the new loop.
7160   // Similarly, we create a new latch condition when setting up the structure
7161   // of the new loop, so the old one can become dead.
7162   SmallPtrSet<Instruction *, 4> DeadInstructions;
7163   collectTriviallyDeadInstructions(DeadInstructions);
7164
7165   // Add assume instructions we need to drop to DeadInstructions, to prevent
7166   // them from being added to the VPlan.
7167   // TODO: We only need to drop assumes in blocks that get flattened. If the
7168   // control flow is preserved, we should keep them.
7169   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7170   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7171
7172   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7173   // Dead instructions do not need sinking. Remove them from SinkAfter.
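  // Note added for exposition: each SinkAfter entry maps an instruction to the
  // instruction it must be moved after (recorded by legality, e.g. for
  // first-order recurrences); entries keyed by a dead instruction are dropped
  // here because no recipe will be built for them.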
7174 for (Instruction *I : DeadInstructions) 7175 SinkAfter.erase(I); 7176 7177 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 7178 VFRange SubRange = {VF, MaxVF + 1}; 7179 VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef, 7180 DeadInstructions, SinkAfter)); 7181 VF = SubRange.End; 7182 } 7183 } 7184 7185 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 7186 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, 7187 SmallPtrSetImpl<Instruction *> &DeadInstructions, 7188 const DenseMap<Instruction *, Instruction *> &SinkAfter) { 7189 7190 // Hold a mapping from predicated instructions to their recipes, in order to 7191 // fix their AlsoPack behavior if a user is determined to replicate and use a 7192 // scalar instead of vector value. 7193 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; 7194 7195 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 7196 7197 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); 7198 7199 // --------------------------------------------------------------------------- 7200 // Pre-construction: record ingredients whose recipes we'll need to further 7201 // process after constructing the initial VPlan. 7202 // --------------------------------------------------------------------------- 7203 7204 // Mark instructions we'll need to sink later and their targets as 7205 // ingredients whose recipe we'll need to record. 7206 for (auto &Entry : SinkAfter) { 7207 RecipeBuilder.recordRecipeOf(Entry.first); 7208 RecipeBuilder.recordRecipeOf(Entry.second); 7209 } 7210 7211 // For each interleave group which is relevant for this (possibly trimmed) 7212 // Range, add it to the set of groups to be later applied to the VPlan and add 7213 // placeholders for its members' Recipes which we'll be replacing with a 7214 // single VPInterleaveRecipe. 7215 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 7216 auto applyIG = [IG, this](unsigned VF) -> bool { 7217 return (VF >= 2 && // Query is illegal for VF == 1 7218 CM.getWideningDecision(IG->getInsertPos(), VF) == 7219 LoopVectorizationCostModel::CM_Interleave); 7220 }; 7221 if (!getDecisionAndClampRange(applyIG, Range)) 7222 continue; 7223 InterleaveGroups.insert(IG); 7224 for (unsigned i = 0; i < IG->getFactor(); i++) 7225 if (Instruction *Member = IG->getMember(i)) 7226 RecipeBuilder.recordRecipeOf(Member); 7227 }; 7228 7229 // --------------------------------------------------------------------------- 7230 // Build initial VPlan: Scan the body of the loop in a topological order to 7231 // visit each basic block after having visited its predecessor basic blocks. 7232 // --------------------------------------------------------------------------- 7233 7234 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 7235 auto Plan = std::make_unique<VPlan>(); 7236 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 7237 Plan->setEntry(VPBB); 7238 7239 // Represent values that will have defs inside VPlan. 7240 for (Value *V : NeedDef) 7241 Plan->addVPValue(V); 7242 7243 // Scan the body of the loop in a topological order to visit each basic block 7244 // after having visited its predecessor basic blocks. 7245 LoopBlocksDFS DFS(OrigLoop); 7246 DFS.perform(LI); 7247 7248 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 7249 // Relevant instructions from basic block BB will be grouped into VPRecipe 7250 // ingredients and fill a new VPBasicBlock. 
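    // Note added for exposition: a single original basic block may end up
    // split across several VPBasicBlocks when replicate regions are created
    // below; VPBBsForBB numbers those extra blocks, giving names such as
    // "for.body.1" (illustrative name).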
7251     unsigned VPBBsForBB = 0;
7252     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7253     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7254     VPBB = FirstVPBBForBB;
7255     Builder.setInsertPoint(VPBB);
7256
7257     // Introduce each ingredient into VPlan.
7258     // TODO: Model and preserve debug intrinsics in VPlan.
7259     for (Instruction &I : BB->instructionsWithoutDebug()) {
7260       Instruction *Instr = &I;
7261
7262       // First filter out irrelevant instructions, to ensure no recipes are
7263       // built for them.
7264       if (isa<BranchInst>(Instr) ||
7265           DeadInstructions.find(Instr) != DeadInstructions.end())
7266         continue;
7267
7268       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7269         continue;
7270
7271       // Otherwise, if all widening options failed, the instruction is to be
7272       // replicated. This may create a successor for VPBB.
7273       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7274           Instr, Range, VPBB, PredInst2Recipe, Plan);
7275       if (NextVPBB != VPBB) {
7276         VPBB = NextVPBB;
7277         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7278                                     : "");
7279       }
7280     }
7281   }
7282
7283   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
7284   // may also be empty, such as the last one, VPBB, reflecting original
7285   // basic blocks with no recipes.
7286   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7287   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7288   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7289   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7290   delete PreEntry;
7291
7292   // ---------------------------------------------------------------------------
7293   // Transform initial VPlan: Apply previously taken decisions, in order, to
7294   // bring the VPlan to its final state.
7295   // ---------------------------------------------------------------------------
7296
7297   // Apply Sink-After legal constraints.
7298   for (auto &Entry : SinkAfter) {
7299     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7300     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7301     Sink->moveAfter(Target);
7302   }
7303
7304   // Interleave memory: for each Interleave Group we marked earlier as relevant
7305   // for this VPlan, replace the Recipes widening its memory instructions with a
7306   // single VPInterleaveRecipe at its insertion point.
7307   for (auto IG : InterleaveGroups) {
7308     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7309         RecipeBuilder.getRecipe(IG->getInsertPos()));
7310     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7311         ->insertBefore(Recipe);
7312
7313     for (unsigned i = 0; i < IG->getFactor(); ++i)
7314       if (Instruction *Member = IG->getMember(i)) {
7315         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7316       }
7317   }
7318
7319   // Finally, if tail is folded by masking, introduce selects between the phi
7320   // and the live-out instruction of each reduction, at the end of the latch.
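  // Illustrative sketch added for exposition (names and VF are made up): for a
  // reduction under tail folding the select keeps masked-off lanes at the
  // phi's previous value, roughly
  //   %red.select = select <4 x i1> %header.mask, <4 x i32> %red.next,
  //                        <4 x i32> %red.phi
  // so inactive lanes of the final iteration do not affect the result.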
7321 if (CM.foldTailByMasking()) { 7322 Builder.setInsertPoint(VPBB); 7323 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 7324 for (auto &Reduction : Legal->getReductionVars()) { 7325 VPValue *Phi = Plan->getVPValue(Reduction.first); 7326 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); 7327 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 7328 } 7329 } 7330 7331 std::string PlanName; 7332 raw_string_ostream RSO(PlanName); 7333 unsigned VF = Range.Start; 7334 Plan->addVF(VF); 7335 RSO << "Initial VPlan for VF={" << VF; 7336 for (VF *= 2; VF < Range.End; VF *= 2) { 7337 Plan->addVF(VF); 7338 RSO << "," << VF; 7339 } 7340 RSO << "},UF>=1"; 7341 RSO.flush(); 7342 Plan->setName(PlanName); 7343 7344 return Plan; 7345 } 7346 7347 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 7348 // Outer loop handling: They may require CFG and instruction level 7349 // transformations before even evaluating whether vectorization is profitable. 7350 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7351 // the vectorization pipeline. 7352 assert(!OrigLoop->empty()); 7353 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7354 7355 // Create new empty VPlan 7356 auto Plan = std::make_unique<VPlan>(); 7357 7358 // Build hierarchical CFG 7359 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 7360 HCFGBuilder.buildHierarchicalCFG(); 7361 7362 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) 7363 Plan->addVF(VF); 7364 7365 if (EnableVPlanPredication) { 7366 VPlanPredicator VPP(*Plan); 7367 VPP.predicate(); 7368 7369 // Avoid running transformation to recipes until masked code generation in 7370 // VPlan-native path is in place. 7371 return Plan; 7372 } 7373 7374 SmallPtrSet<Instruction *, 1> DeadInstructions; 7375 VPlanTransforms::VPInstructionsToVPRecipes( 7376 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 7377 return Plan; 7378 } 7379 7380 Value* LoopVectorizationPlanner::VPCallbackILV:: 7381 getOrCreateVectorValues(Value *V, unsigned Part) { 7382 return ILV.getOrCreateVectorValue(V, Part); 7383 } 7384 7385 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 7386 Value *V, const VPIteration &Instance) { 7387 return ILV.getOrCreateScalarValue(V, Instance); 7388 } 7389 7390 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 7391 VPSlotTracker &SlotTracker) const { 7392 O << " +\n" 7393 << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7394 IG->getInsertPos()->printAsOperand(O, false); 7395 O << ", "; 7396 getAddr()->printAsOperand(O, SlotTracker); 7397 VPValue *Mask = getMask(); 7398 if (Mask) { 7399 O << ", "; 7400 Mask->printAsOperand(O, SlotTracker); 7401 } 7402 O << "\\l\""; 7403 for (unsigned i = 0; i < IG->getFactor(); ++i) 7404 if (Instruction *I = IG->getMember(i)) 7405 O << " +\n" 7406 << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\""; 7407 } 7408 7409 void VPWidenCallRecipe::execute(VPTransformState &State) { 7410 State.ILV->widenCallInstruction(Ingredient, User, State); 7411 } 7412 7413 void VPWidenRecipe::execute(VPTransformState &State) { 7414 State.ILV->widenInstruction(Ingredient); 7415 } 7416 7417 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7418 State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, 7419 IsIndexLoopInvariant); 7420 } 7421 7422 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7423 assert(!State.Instance && "Int or FP 
induction being replicated.");
7424   State.ILV->widenIntOrFpInduction(IV, Trunc);
7425 }
7426
7427 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7428   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7429 }
7430
7431 void VPBlendRecipe::execute(VPTransformState &State) {
7432   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7433   // We know that all PHIs in non-header blocks are converted into
7434   // selects, so we don't have to worry about the insertion order and we
7435   // can just use the builder.
7436   // At this point we generate the predication tree. There may be
7437   // duplications since this is a simple recursive scan, but future
7438   // optimizations will clean it up.
7439
7440   unsigned NumIncoming = getNumIncomingValues();
7441
7442   // Generate a sequence of selects of the form:
7443   // SELECT(Mask3, In3,
7444   //        SELECT(Mask2, In2,
7445   //               ( ...)))
7446   InnerLoopVectorizer::VectorParts Entry(State.UF);
7447   for (unsigned In = 0; In < NumIncoming; ++In) {
7448     for (unsigned Part = 0; Part < State.UF; ++Part) {
7449       // We might have single edge PHIs (blocks) - use an identity
7450       // 'select' for the first PHI operand.
7451       Value *In0 = State.get(getIncomingValue(In), Part);
7452       if (In == 0)
7453         Entry[Part] = In0; // Initialize with the first incoming value.
7454       else {
7455         // Select between the current value and the previous incoming edge
7456         // based on the incoming mask.
7457         Value *Cond = State.get(getMask(In), Part);
7458         Entry[Part] =
7459             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7460       }
7461     }
7462   }
7463   for (unsigned Part = 0; Part < State.UF; ++Part)
7464     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7465 }
7466
7467 void VPInterleaveRecipe::execute(VPTransformState &State) {
7468   assert(!State.Instance && "Interleave group being replicated.");
7469   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(),
7470                                       getMask());
7471 }
7472
7473 void VPReplicateRecipe::execute(VPTransformState &State) {
7474   if (State.Instance) { // Generate a single instance.
7475     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7476     // Insert scalar instance packing it into a vector.
7477     if (AlsoPack && State.VF > 1) {
7478       // If we're constructing lane 0, initialize to start from undef.
7479       if (State.Instance->Lane == 0) {
7480         Value *Undef =
7481             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7482         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7483       }
7484       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7485     }
7486     return;
7487   }
7488
7489   // Generate scalar instances for all VF lanes of all UF parts, unless the
7490   // instruction is uniform, in which case generate only the first lane for
7491   // each of the UF parts.
7492   unsigned EndLane = IsUniform ? 1 : State.VF;
7493   for (unsigned Part = 0; Part < State.UF; ++Part)
7494     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7495       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7496 }
7497
7498 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7499   assert(State.Instance && "Branch on Mask works only on single instance.");
7500
7501   unsigned Part = State.Instance->Part;
7502   unsigned Lane = State.Instance->Lane;
7503
7504   Value *ConditionBit = nullptr;
7505   if (!User) // Block in mask is all-one.
7506 ConditionBit = State.Builder.getTrue(); 7507 else { 7508 VPValue *BlockInMask = User->getOperand(0); 7509 ConditionBit = State.get(BlockInMask, Part); 7510 if (ConditionBit->getType()->isVectorTy()) 7511 ConditionBit = State.Builder.CreateExtractElement( 7512 ConditionBit, State.Builder.getInt32(Lane)); 7513 } 7514 7515 // Replace the temporary unreachable terminator with a new conditional branch, 7516 // whose two destinations will be set later when they are created. 7517 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 7518 assert(isa<UnreachableInst>(CurrentTerminator) && 7519 "Expected to replace unreachable terminator with conditional branch."); 7520 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 7521 CondBr->setSuccessor(0, nullptr); 7522 ReplaceInstWithInst(CurrentTerminator, CondBr); 7523 } 7524 7525 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 7526 assert(State.Instance && "Predicated instruction PHI works per instance."); 7527 Instruction *ScalarPredInst = cast<Instruction>( 7528 State.ValueMap.getScalarValue(PredInst, *State.Instance)); 7529 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 7530 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 7531 assert(PredicatingBB && "Predicated block has no single predecessor."); 7532 7533 // By current pack/unpack logic we need to generate only a single phi node: if 7534 // a vector value for the predicated instruction exists at this point it means 7535 // the instruction has vector users only, and a phi for the vector value is 7536 // needed. In this case the recipe of the predicated instruction is marked to 7537 // also do that packing, thereby "hoisting" the insert-element sequence. 7538 // Otherwise, a phi node for the scalar value is needed. 7539 unsigned Part = State.Instance->Part; 7540 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 7541 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 7542 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 7543 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 7544 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 7545 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 7546 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 7547 } else { 7548 Type *PredInstType = PredInst->getType(); 7549 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 7550 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 7551 Phi->addIncoming(ScalarPredInst, PredicatedBB); 7552 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 7553 } 7554 } 7555 7556 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 7557 VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr; 7558 State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue, 7559 getMask()); 7560 } 7561 7562 // Determine how to lower the scalar epilogue, which depends on 1) optimising 7563 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 7564 // predication, and 4) a TTI hook that analyses whether the loop is suitable 7565 // for predication. 
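// Summary added for exposition (mirrors the numbered checks below, enum names
// shortened):
//   1) optsize/minsize and not force-enabled        -> NotAllowedOptSize
//   2) PreferPredicateOverEpilog explicitly false   -> ScalarEpilogueAllowed
//   3/4) predication requested via option/hint/TTI  -> NotNeededUsePredicate
//   otherwise                                       -> ScalarEpilogueAllowed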
7566 static ScalarEpilogueLowering getScalarEpilogueLowering(
7567     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
7568     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7569     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
7570     LoopVectorizationLegality &LVL) {
7571   bool OptSize =
7572       F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7573                                                      PGSOQueryType::IRPass);
7574   // 1) OptSize takes precedence over all other options, i.e. if this is set,
7575   // don't look at hints or options, and don't request a scalar epilogue.
7576   if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
7577     return CM_ScalarEpilogueNotAllowedOptSize;
7578
7579   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7580                               !PreferPredicateOverEpilog;
7581
7582   // 2) Next, if disabling predication is requested on the command line, honour
7583   // this and request a scalar epilogue.
7584   if (PredicateOptDisabled)
7585     return CM_ScalarEpilogueAllowed;
7586
7587   // 3) and 4) Check whether predication is requested on the command line or
7588   // with a loop hint, or whether the TTI hook indicates this is profitable;
7589   // if so, request predication.
7590   if (PreferPredicateOverEpilog ||
7591       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7592       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7593                                         LVL.getLAI()) &&
7594        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7595     return CM_ScalarEpilogueNotNeededUsePredicate;
7596
7597   return CM_ScalarEpilogueAllowed;
7598 }
7599
7600 // Process the loop in the VPlan-native vectorization path. This path builds
7601 // VPlan upfront in the vectorization pipeline, which allows applying
7602 // VPlan-to-VPlan transformations from the very beginning without modifying the
7603 // input LLVM IR.
7604 static bool processLoopInVPlanNativePath(
7605     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7606     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7607     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7608     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7609     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7610
7611   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7612   Function *F = L->getHeader()->getParent();
7613   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7614
7615   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7616       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7617
7618   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7619                                 &Hints, IAI);
7620   // Use the planner for outer loop vectorization.
7621   // TODO: CM is not used at this point inside the planner. Turn CM into an
7622   // optional argument if we don't need it in the future.
7623   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);
7624
7625   // Get user vectorization factor.
7626   const unsigned UserVF = Hints.getWidth();
7627
7628   // Plan how to best vectorize, return the best VF and its cost.
7629   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7630
7631   // If we are stress testing VPlan builds, do not attempt to generate vector
7632   // code. Masked vector code generation support will follow soon.
7633   // Also, do not attempt to vectorize if no vector code will be produced.
7634 if (VPlanBuildStressTest || EnableVPlanPredication || 7635 VectorizationFactor::Disabled() == VF) 7636 return false; 7637 7638 LVP.setBestPlan(VF.Width, 1); 7639 7640 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 7641 &CM); 7642 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 7643 << L->getHeader()->getParent()->getName() << "\"\n"); 7644 LVP.executePlan(LB, DT); 7645 7646 // Mark the loop as already vectorized to avoid vectorizing again. 7647 Hints.setAlreadyVectorized(); 7648 7649 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); 7650 return true; 7651 } 7652 7653 bool LoopVectorizePass::processLoop(Loop *L) { 7654 assert((EnableVPlanNativePath || L->empty()) && 7655 "VPlan-native path is not enabled. Only process inner loops."); 7656 7657 #ifndef NDEBUG 7658 const std::string DebugLocStr = getDebugLocString(L); 7659 #endif /* NDEBUG */ 7660 7661 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 7662 << L->getHeader()->getParent()->getName() << "\" from " 7663 << DebugLocStr << "\n"); 7664 7665 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 7666 7667 LLVM_DEBUG( 7668 dbgs() << "LV: Loop hints:" 7669 << " force=" 7670 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 7671 ? "disabled" 7672 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 7673 ? "enabled" 7674 : "?")) 7675 << " width=" << Hints.getWidth() 7676 << " unroll=" << Hints.getInterleave() << "\n"); 7677 7678 // Function containing loop 7679 Function *F = L->getHeader()->getParent(); 7680 7681 // Looking at the diagnostic output is the only way to determine if a loop 7682 // was vectorized (other than looking at the IR or machine code), so it 7683 // is important to generate an optimization remark for each loop. Most of 7684 // these messages are generated as OptimizationRemarkAnalysis. Remarks 7685 // generated as OptimizationRemark and OptimizationRemarkMissed are 7686 // less verbose reporting vectorized loops and unvectorized loops that may 7687 // benefit from vectorization, respectively. 7688 7689 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 7690 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 7691 return false; 7692 } 7693 7694 PredicatedScalarEvolution PSE(*SE, *L); 7695 7696 // Check if it is legal to vectorize the loop. 7697 LoopVectorizationRequirements Requirements(*ORE); 7698 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 7699 &Requirements, &Hints, DB, AC); 7700 if (!LVL.canVectorize(EnableVPlanNativePath)) { 7701 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 7702 Hints.emitRemarkWithHints(); 7703 return false; 7704 } 7705 7706 // Check the function attributes and profiles to find out if this function 7707 // should be optimized for size. 7708 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7709 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 7710 7711 // Entrance to the VPlan-native vectorization path. Outer loops are processed 7712 // here. They may require CFG and instruction level transformations before 7713 // even evaluating whether vectorization is profitable. Since we cannot modify 7714 // the incoming IR, we need to build VPlan upfront in the vectorization 7715 // pipeline. 
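  // Note added for exposition: a non-empty loop here is one that still
  // contains subloops, i.e. an outer loop, and is therefore routed to the
  // VPlan-native path; inner loops continue with the pipeline below.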
7716 if (!L->empty()) 7717 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 7718 ORE, BFI, PSI, Hints); 7719 7720 assert(L->empty() && "Inner loop expected."); 7721 7722 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 7723 // count by optimizing for size, to minimize overheads. 7724 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 7725 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 7726 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 7727 << "This loop is worth vectorizing only if no scalar " 7728 << "iteration overheads are incurred."); 7729 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 7730 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 7731 else { 7732 LLVM_DEBUG(dbgs() << "\n"); 7733 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 7734 } 7735 } 7736 7737 // Check the function attributes to see if implicit floats are allowed. 7738 // FIXME: This check doesn't seem possibly correct -- what if the loop is 7739 // an integer loop and the vector instructions selected are purely integer 7740 // vector instructions? 7741 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 7742 reportVectorizationFailure( 7743 "Can't vectorize when the NoImplicitFloat attribute is used", 7744 "loop not vectorized due to NoImplicitFloat attribute", 7745 "NoImplicitFloat", ORE, L); 7746 Hints.emitRemarkWithHints(); 7747 return false; 7748 } 7749 7750 // Check if the target supports potentially unsafe FP vectorization. 7751 // FIXME: Add a check for the type of safety issue (denormal, signaling) 7752 // for the target we're vectorizing for, to make sure none of the 7753 // additional fp-math flags can help. 7754 if (Hints.isPotentiallyUnsafe() && 7755 TTI->isFPVectorizationPotentiallyUnsafe()) { 7756 reportVectorizationFailure( 7757 "Potentially unsafe FP op prevents vectorization", 7758 "loop not vectorized due to unsafe FP support.", 7759 "UnsafeFP", ORE, L); 7760 Hints.emitRemarkWithHints(); 7761 return false; 7762 } 7763 7764 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 7765 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 7766 7767 // If an override option has been passed in for interleaved accesses, use it. 7768 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 7769 UseInterleaved = EnableInterleavedMemAccesses; 7770 7771 // Analyze interleaved memory accesses. 7772 if (UseInterleaved) { 7773 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 7774 } 7775 7776 // Use the cost model. 7777 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 7778 F, &Hints, IAI); 7779 CM.collectValuesToIgnore(); 7780 7781 // Use the planner for vectorization. 7782 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI); 7783 7784 // Get user vectorization factor. 7785 unsigned UserVF = Hints.getWidth(); 7786 7787 // Plan how to best vectorize, return the best VF and its cost. 7788 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF); 7789 7790 VectorizationFactor VF = VectorizationFactor::Disabled(); 7791 unsigned IC = 1; 7792 unsigned UserIC = Hints.getInterleave(); 7793 7794 if (MaybeVF) { 7795 VF = *MaybeVF; 7796 // Select the interleave count. 7797 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 7798 } 7799 7800 // Identify the diagnostic messages that should be produced. 
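  // Note added for exposition: the checks below fill VecDiagMsg/IntDiagMsg and
  // may clear VectorizeLoop/InterleaveLoop independently; the four possible
  // combinations are reported and acted on further down.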
7801   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7802   bool VectorizeLoop = true, InterleaveLoop = true;
7803   if (Requirements.doesNotMeet(F, L, Hints)) {
7804     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7805                          "requirements.\n");
7806     Hints.emitRemarkWithHints();
7807     return false;
7808   }
7809
7810   if (VF.Width == 1) {
7811     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7812     VecDiagMsg = std::make_pair(
7813         "VectorizationNotBeneficial",
7814         "the cost-model indicates that vectorization is not beneficial");
7815     VectorizeLoop = false;
7816   }
7817
7818   if (!MaybeVF && UserIC > 1) {
7819     // Tell the user interleaving was avoided up-front, despite being explicitly
7820     // requested.
7821     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7822                          "interleaving should be avoided up front\n");
7823     IntDiagMsg = std::make_pair(
7824         "InterleavingAvoided",
7825         "Ignoring UserIC, because interleaving was avoided up front");
7826     InterleaveLoop = false;
7827   } else if (IC == 1 && UserIC <= 1) {
7828     // Tell the user interleaving is not beneficial.
7829     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7830     IntDiagMsg = std::make_pair(
7831         "InterleavingNotBeneficial",
7832         "the cost-model indicates that interleaving is not beneficial");
7833     InterleaveLoop = false;
7834     if (UserIC == 1) {
7835       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7836       IntDiagMsg.second +=
7837           " and is explicitly disabled or interleave count is set to 1";
7838     }
7839   } else if (IC > 1 && UserIC == 1) {
7840     // Tell the user interleaving is beneficial, but it is explicitly disabled.
7841     LLVM_DEBUG(
7842         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7843     IntDiagMsg = std::make_pair(
7844         "InterleavingBeneficialButDisabled",
7845         "the cost-model indicates that interleaving is beneficial "
7846         "but is explicitly disabled or interleave count is set to 1");
7847     InterleaveLoop = false;
7848   }
7849
7850   // Override IC if user provided an interleave count.
7851   IC = UserIC > 0 ? UserIC : IC;
7852
7853   // Emit diagnostic messages, if any.
7854   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7855   if (!VectorizeLoop && !InterleaveLoop) {
7856     // Do not vectorize or interleave the loop.
7857 ORE->emit([&]() { 7858 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 7859 L->getStartLoc(), L->getHeader()) 7860 << VecDiagMsg.second; 7861 }); 7862 ORE->emit([&]() { 7863 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 7864 L->getStartLoc(), L->getHeader()) 7865 << IntDiagMsg.second; 7866 }); 7867 return false; 7868 } else if (!VectorizeLoop && InterleaveLoop) { 7869 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7870 ORE->emit([&]() { 7871 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 7872 L->getStartLoc(), L->getHeader()) 7873 << VecDiagMsg.second; 7874 }); 7875 } else if (VectorizeLoop && !InterleaveLoop) { 7876 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7877 << ") in " << DebugLocStr << '\n'); 7878 ORE->emit([&]() { 7879 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 7880 L->getStartLoc(), L->getHeader()) 7881 << IntDiagMsg.second; 7882 }); 7883 } else if (VectorizeLoop && InterleaveLoop) { 7884 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7885 << ") in " << DebugLocStr << '\n'); 7886 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7887 } 7888 7889 LVP.setBestPlan(VF.Width, IC); 7890 7891 using namespace ore; 7892 bool DisableRuntimeUnroll = false; 7893 MDNode *OrigLoopID = L->getLoopID(); 7894 7895 if (!VectorizeLoop) { 7896 assert(IC > 1 && "interleave count should not be 1 or 0"); 7897 // If we decided that it is not legal to vectorize the loop, then 7898 // interleave it. 7899 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 7900 &CM); 7901 LVP.executePlan(Unroller, DT); 7902 7903 ORE->emit([&]() { 7904 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 7905 L->getHeader()) 7906 << "interleaved loop (interleaved count: " 7907 << NV("InterleaveCount", IC) << ")"; 7908 }); 7909 } else { 7910 // If we decided that it is *legal* to vectorize the loop, then do it. 7911 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 7912 &LVL, &CM); 7913 LVP.executePlan(LB, DT); 7914 ++LoopsVectorized; 7915 7916 // Add metadata to disable runtime unrolling a scalar loop when there are 7917 // no runtime checks about strides and memory. A scalar loop that is 7918 // rarely used is not worth unrolling. 7919 if (!LB.areSafetyChecksAdded()) 7920 DisableRuntimeUnroll = true; 7921 7922 // Report the vectorization decision. 7923 ORE->emit([&]() { 7924 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 7925 L->getHeader()) 7926 << "vectorized loop (vectorization width: " 7927 << NV("VectorizationFactor", VF.Width) 7928 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 7929 }); 7930 } 7931 7932 Optional<MDNode *> RemainderLoopID = 7933 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7934 LLVMLoopVectorizeFollowupEpilogue}); 7935 if (RemainderLoopID.hasValue()) { 7936 L->setLoopID(RemainderLoopID.getValue()); 7937 } else { 7938 if (DisableRuntimeUnroll) 7939 AddRuntimeUnrollDisableMetaData(L); 7940 7941 // Mark the loop as already vectorized to avoid vectorizing again. 
7942     Hints.setAlreadyVectorized();
7943   }
7944
7945   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7946   return true;
7947 }
7948
7949 bool LoopVectorizePass::runImpl(
7950     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7951     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7952     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7953     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7954     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7955   SE = &SE_;
7956   LI = &LI_;
7957   TTI = &TTI_;
7958   DT = &DT_;
7959   BFI = &BFI_;
7960   TLI = TLI_;
7961   AA = &AA_;
7962   AC = &AC_;
7963   GetLAA = &GetLAA_;
7964   DB = &DB_;
7965   ORE = &ORE_;
7966   PSI = PSI_;
7967
7968   // Don't attempt if
7969   // 1. the target claims to have no vector registers, and
7970   // 2. interleaving won't help ILP.
7971   //
7972   // The second condition is necessary because, even if the target has no
7973   // vector registers, loop vectorization may still enable scalar
7974   // interleaving.
7975   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
7976       TTI->getMaxInterleaveFactor(1) < 2)
7977     return false;
7978
7979   bool Changed = false;
7980
7981   // The vectorizer requires loops to be in simplified form.
7982   // Since simplification may add new inner loops, it has to run before the
7983   // legality and profitability checks. This means running the loop vectorizer
7984   // will simplify all loops, regardless of whether anything ends up being
7985   // vectorized.
7986   for (auto &L : *LI)
7987     Changed |=
7988         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7989
7990   // Build up a worklist of inner loops to vectorize. This is necessary as
7991   // the act of vectorizing or partially unrolling a loop creates new loops
7992   // and can invalidate iterators across the loops.
7993   SmallVector<Loop *, 8> Worklist;
7994
7995   for (Loop *L : *LI)
7996     collectSupportedLoops(*L, LI, ORE, Worklist);
7997
7998   LoopsAnalyzed += Worklist.size();
7999
8000   // Now walk the identified inner loops.
8001   while (!Worklist.empty()) {
8002     Loop *L = Worklist.pop_back_val();
8003
8004     // For the inner loops we actually process, form LCSSA to simplify the
8005     // transform.
8006     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
8007
8008     Changed |= processLoop(L);
8009   }
8010
8011   // Process each loop nest in the function.
8012   return Changed;
8013 }
8014
8015 PreservedAnalyses LoopVectorizePass::run(Function &F,
8016                                          FunctionAnalysisManager &AM) {
8017   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
8018   auto &LI = AM.getResult<LoopAnalysis>(F);
8019   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
8020   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
8021   auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
8022   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
8023   auto &AA = AM.getResult<AAManager>(F);
8024   auto &AC = AM.getResult<AssumptionAnalysis>(F);
8025   auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
8026   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
8027   MemorySSA *MSSA = EnableMSSALoopDependency
8028                         ?
&AM.getResult<MemorySSAAnalysis>(F).getMSSA() 8029 : nullptr; 8030 8031 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 8032 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 8033 [&](Loop &L) -> const LoopAccessInfo & { 8034 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; 8035 return LAM.getResult<LoopAccessAnalysis>(L, AR); 8036 }; 8037 const ModuleAnalysisManager &MAM = 8038 AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager(); 8039 ProfileSummaryInfo *PSI = 8040 MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 8041 bool Changed = 8042 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 8043 if (!Changed) 8044 return PreservedAnalyses::all(); 8045 PreservedAnalyses PA; 8046 8047 // We currently do not preserve loopinfo/dominator analyses with outer loop 8048 // vectorization. Until this is addressed, mark these analyses as preserved 8049 // only for non-VPlan-native path. 8050 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 8051 if (!EnableVPlanNativePath) { 8052 PA.preserve<LoopAnalysis>(); 8053 PA.preserve<DominatorTreeAnalysis>(); 8054 } 8055 PA.preserve<BasicAA>(); 8056 PA.preserve<GlobalsAA>(); 8057 return PA; 8058 } 8059