//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//
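
// For illustration only (not drawn from a particular test case): a scalar loop
// such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 42;
//
// is rewritten so that each vector iteration processes VF consecutive elements
// at once, e.g. with VF = 4 the vector body computes a[i..i+3] = b[i..i+3] + 42
// and increments the induction variable by 4; a scalar epilogue (or a
// predicated tail) handles the remaining n % 4 iterations.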

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired and predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
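
// These thresholds and overrides are normally left at their defaults; they can
// be changed on the command line for experiments and testing. Illustrative
// invocation only (flag names are the ones defined in this file; shown with
// the legacy pass manager):
//   opt -loop-vectorize -vectorizer-min-trip-count=4 -S input.ll
// treats loops with a constant trip count of 4 or more as normal candidates
// rather than as 'tiny' loops.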

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function for converting scalar types to vector types.
/// If the incoming type is void, we return void. If the VF is 1, we return
/// the scalar type.
static Type *ToVectorTy(Type *Scalar, unsigned VF) {
  if (Scalar->isVoidTy() || VF == 1)
    return Scalar;
  return VectorType::get(Scalar, VF);
}

/// A helper function that returns the type of a loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
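
// A small illustrative example (assuming a typical data layout): i1 has an
// allocation size of one byte, while a <4 x i1> vector has a store size of one
// byte as well, so at VF = 4 we get 4 * 1 byte != 1 byte and i1 is treated as
// irregular; widened loads and stores of such a type cannot simply reinterpret
// the scalar memory layout.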

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns the "best known" trip count for the specified loop \p L as defined
/// by the following procedure:
///   1) Returns the exact trip count if it is known.
///   2) Returns the expected trip count according to profile data, if any.
///   3) Returns an upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
                bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive.
  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
                            bool IfPredicateInstr);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
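
  // Illustrative sketch (not tied to a particular test): an i32 induction
  // starting at %start with step 1, widened with VF = 4 and UF = 2, yields two
  // vector parts per vector iteration,
  //   part 0: <%start+0, %start+1, %start+2, %start+3>
  //   part 1: <%start+4, %start+5, %start+6, %start+7>
  // while the new scalar canonical induction advances by VF * UF = 8.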

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
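
  // A short example of the on-demand packing described above (illustrative IR,
  // with hypothetical value names): if %d was scalarized at VF = 4, a vector
  // use of %d is satisfied by building
  //   %v0 = insertelement <4 x i32> undef, i32 %d.lane0, i32 0
  //   %v1 = insertelement <4 x i32> %v0, i32 %d.lane1, i32 1
  //   ...
  // whereas a scalar use simply reads the existing per-lane definition.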

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize the interleaved access group that \p Instr belongs to
  /// with the base address given in \p Addr, optionally masking the vector
  /// operations if \p BlockInMask is non-null. Use \p State to translate given
  /// VPValues to IR values in the vectorized loop.
  void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
                                VPValue *Addr, VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr,
                                  VPValue *BlockInMask = nullptr);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at \p StartIdx.
  /// \p Opcode is relevant for FP induction variables.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                               Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AliasAnalysis *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar loops.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                       Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints about how the scalar epilogue loop
// should be lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF();

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way, and
  /// the form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Information about the register usage of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }
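
  // For example (illustrative): given an i64 induction %iv and a use
  //   %t = trunc i64 %iv to i32
  // where the truncate is not free for the target, the truncate can be removed
  // by introducing a new i32 induction variable that produces %t's values
  // directly.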

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    MaybeAlign Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }
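
  // An interleaved access group, for illustration (a stride-2 load group):
  //
  //   for (i = 0; i < N; i += 2) {
  //     a = A[i];     // member with index 0
  //     b = A[i + 1]; // member with index 1
  //     ...
  //   }
  //
  // Such a group can be vectorized with one wide load plus shufflevector
  // instructions instead of separate strided accesses.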

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

  /// The cost computation for interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);

  /// The cost computation for Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, unsigned VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not a multiple of the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
1419 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1420 1421 /// Returns the expected difference in cost from scalarizing the expression 1422 /// feeding a predicated instruction \p PredInst. The instructions to 1423 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1424 /// non-negative return value implies the expression will be scalarized. 1425 /// Currently, only single-use chains are considered for scalarization. 1426 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1427 unsigned VF); 1428 1429 /// Collect the instructions that are uniform after vectorization. An 1430 /// instruction is uniform if we represent it with a single scalar value in 1431 /// the vectorized loop corresponding to each vector iteration. Examples of 1432 /// uniform instructions include pointer operands of consecutive or 1433 /// interleaved memory accesses. Note that although uniformity implies an 1434 /// instruction will be scalar, the reverse is not true. In general, a 1435 /// scalarized instruction will be represented by VF scalar values in the 1436 /// vectorized loop, each corresponding to an iteration of the original 1437 /// scalar loop. 1438 void collectLoopUniforms(unsigned VF); 1439 1440 /// Collect the instructions that are scalar after vectorization. An 1441 /// instruction is scalar if it is known to be uniform or will be scalarized 1442 /// during vectorization. Non-uniform scalarized instructions will be 1443 /// represented by VF values in the vectorized loop, each corresponding to an 1444 /// iteration of the original scalar loop. 1445 void collectLoopScalars(unsigned VF); 1446 1447 /// Keeps cost model vectorization decision and cost for instructions. 1448 /// Right now it is used for memory instructions only. 1449 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, 1450 std::pair<InstWidening, unsigned>>; 1451 1452 DecisionList WideningDecisions; 1453 1454 /// Returns true if \p V is expected to be vectorized and it needs to be 1455 /// extracted. 1456 bool needsExtract(Value *V, unsigned VF) const { 1457 Instruction *I = dyn_cast<Instruction>(V); 1458 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) 1459 return false; 1460 1461 // Assume we can vectorize V (and hence we need extraction) if the 1462 // scalars are not computed yet. This can happen, because it is called 1463 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1464 // the scalars are collected. That should be a safe assumption in most 1465 // cases, because we check if the operands have vectorizable types 1466 // beforehand in LoopVectorizationLegality. 1467 return Scalars.find(VF) == Scalars.end() || 1468 !isScalarAfterVectorization(I, VF); 1469 }; 1470 1471 /// Returns a range containing only operands needing to be extracted. 1472 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1473 unsigned VF) { 1474 return SmallVector<Value *, 4>(make_filter_range( 1475 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1476 } 1477 1478 public: 1479 /// The loop that we evaluate. 1480 Loop *TheLoop; 1481 1482 /// Predicated scalar evolution analysis. 1483 PredicatedScalarEvolution &PSE; 1484 1485 /// Loop Info analysis. 1486 LoopInfo *LI; 1487 1488 /// Vectorization legality. 1489 LoopVectorizationLegality *Legal; 1490 1491 /// Vector target information. 1492 const TargetTransformInfo &TTI; 1493 1494 /// Target Library Info. 
1495 const TargetLibraryInfo *TLI; 1496 1497 /// Demanded bits analysis. 1498 DemandedBits *DB; 1499 1500 /// Assumption cache. 1501 AssumptionCache *AC; 1502 1503 /// Interface to emit optimization remarks. 1504 OptimizationRemarkEmitter *ORE; 1505 1506 const Function *TheFunction; 1507 1508 /// Loop Vectorize Hint. 1509 const LoopVectorizeHints *Hints; 1510 1511 /// The interleave access information contains groups of interleaved accesses 1512 /// with the same stride and close to each other. 1513 InterleavedAccessInfo &InterleaveInfo; 1514 1515 /// Values to ignore in the cost model. 1516 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1517 1518 /// Values to ignore in the cost model when VF > 1. 1519 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1520 }; 1521 1522 } // end namespace llvm 1523 1524 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1525 // vectorization. The loop needs to be annotated with #pragma omp simd 1526 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1527 // vector length information is not provided, vectorization is not considered 1528 // explicit. Interleave hints are not allowed either. These limitations will be 1529 // relaxed in the future. 1530 // Please, note that we are currently forced to abuse the pragma 'clang 1531 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1532 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1533 // provides *explicit vectorization hints* (LV can bypass legal checks and 1534 // assume that vectorization is legal). However, both hints are implemented 1535 // using the same metadata (llvm.loop.vectorize, processed by 1536 // LoopVectorizeHints). This will be fixed in the future when the native IR 1537 // representation for pragma 'omp simd' is introduced. 1538 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1539 OptimizationRemarkEmitter *ORE) { 1540 assert(!OuterLp->empty() && "This is not an outer loop"); 1541 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1542 1543 // Only outer loops with an explicit vectorization hint are supported. 1544 // Unannotated outer loops are ignored. 1545 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1546 return false; 1547 1548 Function *Fn = OuterLp->getHeader()->getParent(); 1549 if (!Hints.allowVectorization(Fn, OuterLp, 1550 true /*VectorizeOnlyWhenForced*/)) { 1551 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1552 return false; 1553 } 1554 1555 if (Hints.getInterleave() > 1) { 1556 // TODO: Interleave support is future work. 1557 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1558 "outer loops.\n"); 1559 Hints.emitRemarkWithHints(); 1560 return false; 1561 } 1562 1563 return true; 1564 } 1565 1566 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1567 OptimizationRemarkEmitter *ORE, 1568 SmallVectorImpl<Loop *> &V) { 1569 // Collect inner loops and outer loops without irreducible control flow. For 1570 // now, only collect outer loops that have explicit vectorization hints. If we 1571 // are stress testing the VPlan H-CFG construction, we collect the outermost 1572 // loop of every loop nest. 
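  // Illustrative example (not from the original source): for a nest such as
  //   for (i = 0; i < N; ++i)     // outer, annotated '#pragma omp simd simdlen(4)'
  //     for (j = 0; j < M; ++j)   // inner
  //       ...
  // the annotated outer loop is collected (and the recursion stops) only when
  // EnableVPlanNativePath is set; otherwise the recursion below descends the
  // nest and collects just the innermost loop.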
1573 if (L.empty() || VPlanBuildStressTest || 1574 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1575 LoopBlocksRPO RPOT(&L); 1576 RPOT.perform(LI); 1577 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1578 V.push_back(&L); 1579 // TODO: Collect inner loops inside marked outer loops in case 1580 // vectorization fails for the outer loop. Do not invoke 1581 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1582 // already known to be reducible. We can use an inherited attribute for 1583 // that. 1584 return; 1585 } 1586 } 1587 for (Loop *InnerL : L) 1588 collectSupportedLoops(*InnerL, LI, ORE, V); 1589 } 1590 1591 namespace { 1592 1593 /// The LoopVectorize Pass. 1594 struct LoopVectorize : public FunctionPass { 1595 /// Pass identification, replacement for typeid 1596 static char ID; 1597 1598 LoopVectorizePass Impl; 1599 1600 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1601 bool VectorizeOnlyWhenForced = false) 1602 : FunctionPass(ID) { 1603 Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced; 1604 Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced; 1605 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1606 } 1607 1608 bool runOnFunction(Function &F) override { 1609 if (skipFunction(F)) 1610 return false; 1611 1612 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1613 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1614 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1615 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1616 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1617 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1618 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1619 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1620 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1621 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1622 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1623 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1624 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1625 1626 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1627 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1628 1629 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1630 GetLAA, *ORE, PSI); 1631 } 1632 1633 void getAnalysisUsage(AnalysisUsage &AU) const override { 1634 AU.addRequired<AssumptionCacheTracker>(); 1635 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1636 AU.addRequired<DominatorTreeWrapperPass>(); 1637 AU.addRequired<LoopInfoWrapperPass>(); 1638 AU.addRequired<ScalarEvolutionWrapperPass>(); 1639 AU.addRequired<TargetTransformInfoWrapperPass>(); 1640 AU.addRequired<AAResultsWrapperPass>(); 1641 AU.addRequired<LoopAccessLegacyAnalysis>(); 1642 AU.addRequired<DemandedBitsWrapperPass>(); 1643 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1644 AU.addRequired<InjectTLIMappingsLegacy>(); 1645 1646 // We currently do not preserve loopinfo/dominator analyses with outer loop 1647 // vectorization. Until this is addressed, mark these analyses as preserved 1648 // only for non-VPlan-native path. 1649 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
1650 if (!EnableVPlanNativePath) { 1651 AU.addPreserved<LoopInfoWrapperPass>(); 1652 AU.addPreserved<DominatorTreeWrapperPass>(); 1653 } 1654 1655 AU.addPreserved<BasicAAWrapperPass>(); 1656 AU.addPreserved<GlobalsAAWrapperPass>(); 1657 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1658 } 1659 }; 1660 1661 } // end anonymous namespace 1662 1663 //===----------------------------------------------------------------------===// 1664 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1665 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1666 //===----------------------------------------------------------------------===// 1667 1668 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1669 // We need to place the broadcast of invariant variables outside the loop, 1670 // but only if it's proven safe to do so. Else, broadcast will be inside 1671 // vector loop body. 1672 Instruction *Instr = dyn_cast<Instruction>(V); 1673 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1674 (!Instr || 1675 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1676 // Place the code for broadcasting invariant variables in the new preheader. 1677 IRBuilder<>::InsertPointGuard Guard(Builder); 1678 if (SafeToHoist) 1679 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1680 1681 // Broadcast the scalar into all locations in the vector. 1682 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1683 1684 return Shuf; 1685 } 1686 1687 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1688 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1689 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1690 "Expected either an induction phi-node or a truncate of it!"); 1691 Value *Start = II.getStartValue(); 1692 1693 // Construct the initial value of the vector IV in the vector loop preheader 1694 auto CurrIP = Builder.saveIP(); 1695 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1696 if (isa<TruncInst>(EntryVal)) { 1697 assert(Start->getType()->isIntegerTy() && 1698 "Truncation requires an integer type"); 1699 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1700 Step = Builder.CreateTrunc(Step, TruncType); 1701 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1702 } 1703 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1704 Value *SteppedStart = 1705 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1706 1707 // We create vector phi nodes for both integer and floating-point induction 1708 // variables. Here, we determine the kind of arithmetic we will perform. 1709 Instruction::BinaryOps AddOp; 1710 Instruction::BinaryOps MulOp; 1711 if (Step->getType()->isIntegerTy()) { 1712 AddOp = Instruction::Add; 1713 MulOp = Instruction::Mul; 1714 } else { 1715 AddOp = II.getInductionOpcode(); 1716 MulOp = Instruction::FMul; 1717 } 1718 1719 // Multiply the vectorization factor by the step using integer or 1720 // floating-point arithmetic as appropriate. 1721 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); 1722 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1723 1724 // Create a vector splat to use in the induction update. 1725 // 1726 // FIXME: If the step is non-constant, we create the vector splat with 1727 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1728 // handle a constant vector splat. 1729 Value *SplatVF = isa<Constant>(Mul) 1730 ? 
ConstantVector::getSplat(VF, cast<Constant>(Mul)) 1731 : Builder.CreateVectorSplat(VF, Mul); 1732 Builder.restoreIP(CurrIP); 1733 1734 // We may need to add the step a number of times, depending on the unroll 1735 // factor. The last of those goes into the PHI. 1736 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1737 &*LoopVectorBody->getFirstInsertionPt()); 1738 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1739 Instruction *LastInduction = VecInd; 1740 for (unsigned Part = 0; Part < UF; ++Part) { 1741 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1742 1743 if (isa<TruncInst>(EntryVal)) 1744 addMetadata(LastInduction, EntryVal); 1745 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1746 1747 LastInduction = cast<Instruction>(addFastMathFlag( 1748 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1749 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1750 } 1751 1752 // Move the last step to the end of the latch block. This ensures consistent 1753 // placement of all induction updates. 1754 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1755 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1756 auto *ICmp = cast<Instruction>(Br->getCondition()); 1757 LastInduction->moveBefore(ICmp); 1758 LastInduction->setName("vec.ind.next"); 1759 1760 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1761 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1762 } 1763 1764 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1765 return Cost->isScalarAfterVectorization(I, VF) || 1766 Cost->isProfitableToScalarize(I, VF); 1767 } 1768 1769 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1770 if (shouldScalarizeInstruction(IV)) 1771 return true; 1772 auto isScalarInst = [&](User *U) -> bool { 1773 auto *I = cast<Instruction>(U); 1774 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1775 }; 1776 return llvm::any_of(IV->users(), isScalarInst); 1777 } 1778 1779 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1780 const InductionDescriptor &ID, const Instruction *EntryVal, 1781 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1782 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1783 "Expected either an induction phi-node or a truncate of it!"); 1784 1785 // This induction variable is not the phi from the original loop but the 1786 // newly-created IV based on the proof that casted Phi is equal to the 1787 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1788 // re-uses the same InductionDescriptor that original IV uses but we don't 1789 // have to do any recording in this case - that is done when original IV is 1790 // processed. 1791 if (isa<TruncInst>(EntryVal)) 1792 return; 1793 1794 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1795 if (Casts.empty()) 1796 return; 1797 // Only the first Cast instruction in the Casts vector is of interest. 1798 // The rest of the Casts (if exist) have no uses outside the 1799 // induction update chain itself. 
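  // Illustrative sketch (assumed IR shapes, not taken from this function): a
  // typical recorded cast is a sign-extension of a narrow IV used for
  // addressing, e.g.
  //   %iv     = phi i32 [ 0, %ph ], [ %iv.next, %latch ]
  //   %iv.ext = sext i32 %iv to i64        ; front of ID.getCastInsts()
  // Recording the widened IV value for the cast below lets users of %iv.ext
  // inside the loop pick up the new vector (or per-lane scalar) induction.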
1800 Instruction *CastInst = *Casts.begin(); 1801 if (Lane < UINT_MAX) 1802 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1803 else 1804 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1805 } 1806 1807 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1808 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1809 "Primary induction variable must have an integer type"); 1810 1811 auto II = Legal->getInductionVars()->find(IV); 1812 assert(II != Legal->getInductionVars()->end() && "IV is not an induction"); 1813 1814 auto ID = II->second; 1815 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1816 1817 // The scalar value to broadcast. This will be derived from the canonical 1818 // induction variable. 1819 Value *ScalarIV = nullptr; 1820 1821 // The value from the original loop to which we are mapping the new induction 1822 // variable. 1823 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1824 1825 // True if we have vectorized the induction variable. 1826 auto VectorizedIV = false; 1827 1828 // Determine if we want a scalar version of the induction variable. This is 1829 // true if the induction variable itself is not widened, or if it has at 1830 // least one user in the loop that is not widened. 1831 auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal); 1832 1833 // Generate code for the induction step. Note that induction steps are 1834 // required to be loop-invariant 1835 assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) && 1836 "Induction step should be loop invariant"); 1837 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1838 Value *Step = nullptr; 1839 if (PSE.getSE()->isSCEVable(IV->getType())) { 1840 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1841 Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), 1842 LoopVectorPreHeader->getTerminator()); 1843 } else { 1844 Step = cast<SCEVUnknown>(ID.getStep())->getValue(); 1845 } 1846 1847 // Try to create a new independent vector induction variable. If we can't 1848 // create the phi node, we will splat the scalar induction variable in each 1849 // loop iteration. 1850 if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) { 1851 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1852 VectorizedIV = true; 1853 } 1854 1855 // If we haven't yet vectorized the induction variable, or if we will create 1856 // a scalar one, we need to define the scalar induction variable and step 1857 // values. If we were given a truncation type, truncate the canonical 1858 // induction variable and step. Otherwise, derive these values from the 1859 // induction descriptor. 1860 if (!VectorizedIV || NeedsScalarIV) { 1861 ScalarIV = Induction; 1862 if (IV != OldInduction) { 1863 ScalarIV = IV->getType()->isIntegerTy() 1864 ? 
Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1865 : Builder.CreateCast(Instruction::SIToFP, Induction, 1866 IV->getType()); 1867 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1868 ScalarIV->setName("offset.idx"); 1869 } 1870 if (Trunc) { 1871 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1872 assert(Step->getType()->isIntegerTy() && 1873 "Truncation requires an integer step"); 1874 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1875 Step = Builder.CreateTrunc(Step, TruncType); 1876 } 1877 } 1878 1879 // If we haven't yet vectorized the induction variable, splat the scalar 1880 // induction variable, and build the necessary step vectors. 1881 // TODO: Don't do it unless the vectorized IV is really required. 1882 if (!VectorizedIV) { 1883 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1884 for (unsigned Part = 0; Part < UF; ++Part) { 1885 Value *EntryPart = 1886 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); 1887 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1888 if (Trunc) 1889 addMetadata(EntryPart, Trunc); 1890 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1891 } 1892 } 1893 1894 // If an induction variable is only used for counting loop iterations or 1895 // calculating addresses, it doesn't need to be widened. Create scalar steps 1896 // that can be used by instructions we will later scalarize. Note that the 1897 // addition of the scalar steps will not increase the number of instructions 1898 // in the loop in the common case prior to InstCombine. We will be trading 1899 // one vector extract for each scalar step. 1900 if (NeedsScalarIV) 1901 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1902 } 1903 1904 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 1905 Instruction::BinaryOps BinOp) { 1906 // Create and check the types. 1907 assert(Val->getType()->isVectorTy() && "Must be a vector"); 1908 int VLen = Val->getType()->getVectorNumElements(); 1909 1910 Type *STy = Val->getType()->getScalarType(); 1911 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1912 "Induction Step must be an integer or FP"); 1913 assert(Step->getType() == STy && "Step has wrong type"); 1914 1915 SmallVector<Constant *, 8> Indices; 1916 1917 if (STy->isIntegerTy()) { 1918 // Create a vector of consecutive numbers from zero to VF. 1919 for (int i = 0; i < VLen; ++i) 1920 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 1921 1922 // Add the consecutive indices to the vector value. 1923 Constant *Cv = ConstantVector::get(Indices); 1924 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 1925 Step = Builder.CreateVectorSplat(VLen, Step); 1926 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1927 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 1928 // which can be found from the original scalar operations. 1929 Step = Builder.CreateMul(Cv, Step); 1930 return Builder.CreateAdd(Val, Step, "induction"); 1931 } 1932 1933 // Floating point induction. 1934 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1935 "Binary Opcode should be specified for FP induction"); 1936 // Create a vector of consecutive numbers from zero to VF. 1937 for (int i = 0; i < VLen; ++i) 1938 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 1939 1940 // Add the consecutive indices to the vector value. 
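  // For example (illustrative values): with VF = 4, StartIdx = 0 and
  // Step = 2.0 the code below forms <0.0, 1.0, 2.0, 3.0>, multiplies it by the
  // splatted step to get <0.0, 2.0, 4.0, 6.0>, and then combines the result
  // with Val via the FAdd/FSub BinOp.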
1941 Constant *Cv = ConstantVector::get(Indices); 1942 1943 Step = Builder.CreateVectorSplat(VLen, Step); 1944 1945 // Floating point operations had to be 'fast' to enable the induction. 1946 FastMathFlags Flags; 1947 Flags.setFast(); 1948 1949 Value *MulOp = Builder.CreateFMul(Cv, Step); 1950 if (isa<Instruction>(MulOp)) 1951 // Have to check, MulOp may be a constant 1952 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 1953 1954 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1955 if (isa<Instruction>(BOp)) 1956 cast<Instruction>(BOp)->setFastMathFlags(Flags); 1957 return BOp; 1958 } 1959 1960 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 1961 Instruction *EntryVal, 1962 const InductionDescriptor &ID) { 1963 // We shouldn't have to build scalar steps if we aren't vectorizing. 1964 assert(VF > 1 && "VF should be greater than one"); 1965 1966 // Get the value type and ensure it and the step have the same integer type. 1967 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 1968 assert(ScalarIVTy == Step->getType() && 1969 "Val and Step should have the same type"); 1970 1971 // We build scalar steps for both integer and floating-point induction 1972 // variables. Here, we determine the kind of arithmetic we will perform. 1973 Instruction::BinaryOps AddOp; 1974 Instruction::BinaryOps MulOp; 1975 if (ScalarIVTy->isIntegerTy()) { 1976 AddOp = Instruction::Add; 1977 MulOp = Instruction::Mul; 1978 } else { 1979 AddOp = ID.getInductionOpcode(); 1980 MulOp = Instruction::FMul; 1981 } 1982 1983 // Determine the number of scalars we need to generate for each unroll 1984 // iteration. If EntryVal is uniform, we only need to generate the first 1985 // lane. Otherwise, we generate all VF values. 1986 unsigned Lanes = 1987 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 1988 : VF; 1989 // Compute the scalar steps and save the results in VectorLoopValueMap. 1990 for (unsigned Part = 0; Part < UF; ++Part) { 1991 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 1992 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); 1993 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 1994 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 1995 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 1996 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 1997 } 1998 } 1999 } 2000 2001 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2002 assert(V != Induction && "The new induction variable should not be used."); 2003 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2004 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2005 2006 // If we have a stride that is replaced by one, do it here. Defer this for 2007 // the VPlan-native path until we start running Legal checks in that path. 2008 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2009 V = ConstantInt::get(V->getType(), 1); 2010 2011 // If we have a vector mapped to this value, return it. 2012 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2013 return VectorLoopValueMap.getVectorValue(V, Part); 2014 2015 // If the value has not been vectorized, check if it has been scalarized 2016 // instead. If it has been scalarized, and we actually need the value in 2017 // vector form, we will construct the vector values on demand. 
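  // Illustrative sketch (assumed VF = 4): if V was scalarized into
  // %v0 ... %v3 for this unroll part, the on-demand vector form is built as
  //   %p0 = insertelement <4 x ty> undef, ty %v0, i32 0
  //   %p1 = insertelement <4 x ty> %p0,   ty %v1, i32 1
  //   ...
  // unless V is uniform after vectorization, in which case lane 0 is simply
  // broadcast (see the uniform case below).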
2018 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2019 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2020 2021 // If we've scalarized a value, that value should be an instruction. 2022 auto *I = cast<Instruction>(V); 2023 2024 // If we aren't vectorizing, we can just copy the scalar map values over to 2025 // the vector map. 2026 if (VF == 1) { 2027 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2028 return ScalarValue; 2029 } 2030 2031 // Get the last scalar instruction we generated for V and Part. If the value 2032 // is known to be uniform after vectorization, this corresponds to lane zero 2033 // of the Part unroll iteration. Otherwise, the last instruction is the one 2034 // we created for the last vector lane of the Part unroll iteration. 2035 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; 2036 auto *LastInst = cast<Instruction>( 2037 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2038 2039 // Set the insert point after the last scalarized instruction. This ensures 2040 // the insertelement sequence will directly follow the scalar definitions. 2041 auto OldIP = Builder.saveIP(); 2042 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2043 Builder.SetInsertPoint(&*NewIP); 2044 2045 // However, if we are vectorizing, we need to construct the vector values. 2046 // If the value is known to be uniform after vectorization, we can just 2047 // broadcast the scalar value corresponding to lane zero for each unroll 2048 // iteration. Otherwise, we construct the vector values using insertelement 2049 // instructions. Since the resulting vectors are stored in 2050 // VectorLoopValueMap, we will only generate the insertelements once. 2051 Value *VectorValue = nullptr; 2052 if (Cost->isUniformAfterVectorization(I, VF)) { 2053 VectorValue = getBroadcastInstrs(ScalarValue); 2054 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2055 } else { 2056 // Initialize packing with insertelements to start from undef. 2057 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2058 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2059 for (unsigned Lane = 0; Lane < VF; ++Lane) 2060 packScalarIntoVectorValue(V, {Part, Lane}); 2061 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2062 } 2063 Builder.restoreIP(OldIP); 2064 return VectorValue; 2065 } 2066 2067 // If this scalar is unknown, assume that it is a constant or that it is 2068 // loop invariant. Broadcast V and save the value for future uses. 2069 Value *B = getBroadcastInstrs(V); 2070 VectorLoopValueMap.setVectorValue(V, Part, B); 2071 return B; 2072 } 2073 2074 Value * 2075 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2076 const VPIteration &Instance) { 2077 // If the value is not an instruction contained in the loop, it should 2078 // already be scalar. 2079 if (OrigLoop->isLoopInvariant(V)) 2080 return V; 2081 2082 assert(Instance.Lane > 0 2083 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2084 : true && "Uniform values only have lane zero"); 2085 2086 // If the value from the original loop has not been vectorized, it is 2087 // represented by UF x VF scalar values in the new loop. Return the requested 2088 // scalar value. 2089 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2090 return VectorLoopValueMap.getScalarValue(V, Instance); 2091 2092 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2093 // for the given unroll part. 
If this entry is not a vector type (i.e., the 2094 // vectorization factor is one), there is no need to generate an 2095 // extractelement instruction. 2096 auto *U = getOrCreateVectorValue(V, Instance.Part); 2097 if (!U->getType()->isVectorTy()) { 2098 assert(VF == 1 && "Value not scalarized has non-vector type"); 2099 return U; 2100 } 2101 2102 // Otherwise, the value from the original loop has been vectorized and is 2103 // represented by UF vector values. Extract and return the requested scalar 2104 // value from the appropriate vector lane. 2105 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2106 } 2107 2108 void InnerLoopVectorizer::packScalarIntoVectorValue( 2109 Value *V, const VPIteration &Instance) { 2110 assert(V != Induction && "The new induction variable should not be used."); 2111 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2112 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2113 2114 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2115 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2116 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2117 Builder.getInt32(Instance.Lane)); 2118 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2119 } 2120 2121 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2122 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2123 SmallVector<Constant *, 8> ShuffleMask; 2124 for (unsigned i = 0; i < VF; ++i) 2125 ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); 2126 2127 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2128 ConstantVector::get(ShuffleMask), 2129 "reverse"); 2130 } 2131 2132 // Return whether we allow using masked interleave-groups (for dealing with 2133 // strided loads/stores that reside in predicated blocks, or for dealing 2134 // with gaps). 2135 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2136 // If an override option has been passed in for interleaved accesses, use it. 2137 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2138 return EnableMaskedInterleavedMemAccesses; 2139 2140 return TTI.enableMaskedInterleavedAccessVectorization(); 2141 } 2142 2143 // Try to vectorize the interleave group that \p Instr belongs to. 2144 // 2145 // E.g. Translate following interleaved load group (factor = 3): 2146 // for (i = 0; i < N; i+=3) { 2147 // R = Pic[i]; // Member of index 0 2148 // G = Pic[i+1]; // Member of index 1 2149 // B = Pic[i+2]; // Member of index 2 2150 // ... // do something to R, G, B 2151 // } 2152 // To: 2153 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2154 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2155 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2156 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2157 // 2158 // Or translate following interleaved store group (factor = 3): 2159 // for (i = 0; i < N; i+=3) { 2160 // ... 
do something to R, G, B
//     Pic[i]   = R;           // Member of index 0
//     Pic[i+1] = G;           // Member of index 1
//     Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
                                                   VPTransformState &State,
                                                   VPValue *Addr,
                                                   VPValue *BlockInMask) {
  const InterleaveGroup<Instruction> *Group =
      Cost->getInterleavedAccessGroup(Instr);
  assert(Group && "Fail to get an interleaved access group.");

  // Skip if the current instruction is not the insert position.
  if (Instr != Group->getInsertPos())
    return;

  const DataLayout &DL = Instr->getModule()->getDataLayout();

  // Prepare the vector type of the interleaved load/store.
  Type *ScalarTy = getMemInstValueType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);

  // Prepare the new pointers.
  SmallVector<Value *, 2> AddrParts;
  unsigned Index = Group->getIndex(Instr);

  // TODO: extend the masked interleaved-group support to reversed access.
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported.");

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  if (Group->isReverse())
    Index += (VF - 1) * Group->getFactor();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *AddrPart = State.get(Addr, {Part, 0});
    setDebugLocFromInst(Builder, AddrPart);

    // Note that the current instruction could be any member of the group, so
    // the address needs to be adjusted to that of the member at index 0.
    //
    // E.g.  a = A[i+1];   // Member of index 1 (Current instruction)
    //       b = A[i];     // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;   // Member of index 1
    //       A[i]   = b;   // Member of index 0
    //       A[i+2] = c;   // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].

    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
      InBounds = gep->isInBounds();
    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
    cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);

    // Cast to the vector pointer type.
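    // For example (illustrative): a factor-3 group of i32 accesses at VF = 4
    // has VecTy = <12 x i32>, so the member-0 address of type i32* is bitcast
    // to <12 x i32>* and a single wide load/store per unroll part covers the
    // whole group.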
2230 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2231 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2232 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2233 } 2234 2235 setDebugLocFromInst(Builder, Instr); 2236 Value *UndefVec = UndefValue::get(VecTy); 2237 2238 Value *MaskForGaps = nullptr; 2239 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2240 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); 2241 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2242 } 2243 2244 // Vectorize the interleaved load group. 2245 if (isa<LoadInst>(Instr)) { 2246 // For each unroll part, create a wide load for the group. 2247 SmallVector<Value *, 2> NewLoads; 2248 for (unsigned Part = 0; Part < UF; Part++) { 2249 Instruction *NewLoad; 2250 if (BlockInMask || MaskForGaps) { 2251 assert(useMaskedInterleavedAccesses(*TTI) && 2252 "masked interleaved groups are not allowed."); 2253 Value *GroupMask = MaskForGaps; 2254 if (BlockInMask) { 2255 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2256 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2257 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2258 Value *ShuffledMask = Builder.CreateShuffleVector( 2259 BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); 2260 GroupMask = MaskForGaps 2261 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2262 MaskForGaps) 2263 : ShuffledMask; 2264 } 2265 NewLoad = 2266 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlignment(), 2267 GroupMask, UndefVec, "wide.masked.vec"); 2268 } 2269 else 2270 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2271 Group->getAlignment(), "wide.vec"); 2272 Group->addMetadata(NewLoad); 2273 NewLoads.push_back(NewLoad); 2274 } 2275 2276 // For each member in the group, shuffle out the appropriate data from the 2277 // wide loads. 2278 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2279 Instruction *Member = Group->getMember(I); 2280 2281 // Skip the gaps in the group. 2282 if (!Member) 2283 continue; 2284 2285 Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF); 2286 for (unsigned Part = 0; Part < UF; Part++) { 2287 Value *StridedVec = Builder.CreateShuffleVector( 2288 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2289 2290 // If this member has different type, cast the result type. 2291 if (Member->getType() != ScalarTy) { 2292 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2293 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2294 } 2295 2296 if (Group->isReverse()) 2297 StridedVec = reverseVector(StridedVec); 2298 2299 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2300 } 2301 } 2302 return; 2303 } 2304 2305 // The sub vector type for current instruction. 2306 VectorType *SubVT = VectorType::get(ScalarTy, VF); 2307 2308 // Vectorize the interleaved store group. 2309 for (unsigned Part = 0; Part < UF; Part++) { 2310 // Collect the stored vector from each member. 
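    // Illustrative note (not from the original): members of the group may
    // store different element types of the same size; each per-member vector
    // is therefore cast to SubVT = <VF x ScalarTy> below before all of them
    // are concatenated and interleaved into one wide store.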
2311 SmallVector<Value *, 4> StoredVecs; 2312 for (unsigned i = 0; i < InterleaveFactor; i++) { 2313 // Interleaved store group doesn't allow a gap, so each index has a member 2314 Instruction *Member = Group->getMember(i); 2315 assert(Member && "Fail to get a member from an interleaved store group"); 2316 2317 Value *StoredVec = getOrCreateVectorValue( 2318 cast<StoreInst>(Member)->getValueOperand(), Part); 2319 if (Group->isReverse()) 2320 StoredVec = reverseVector(StoredVec); 2321 2322 // If this member has different type, cast it to a unified type. 2323 2324 if (StoredVec->getType() != SubVT) 2325 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2326 2327 StoredVecs.push_back(StoredVec); 2328 } 2329 2330 // Concatenate all vectors into a wide vector. 2331 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2332 2333 // Interleave the elements in the wide vector. 2334 Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor); 2335 Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, 2336 "interleaved.vec"); 2337 2338 Instruction *NewStoreInstr; 2339 if (BlockInMask) { 2340 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2341 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2342 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2343 Value *ShuffledMask = Builder.CreateShuffleVector( 2344 BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); 2345 NewStoreInstr = Builder.CreateMaskedStore( 2346 IVec, AddrParts[Part], Group->getAlignment(), ShuffledMask); 2347 } 2348 else 2349 NewStoreInstr = Builder.CreateAlignedStore(IVec, AddrParts[Part], 2350 Group->getAlignment()); 2351 2352 Group->addMetadata(NewStoreInstr); 2353 } 2354 } 2355 2356 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2357 VPTransformState &State, 2358 VPValue *Addr, 2359 VPValue *BlockInMask) { 2360 // Attempt to issue a wide load. 2361 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2362 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2363 2364 assert((LI || SI) && "Invalid Load/Store instruction"); 2365 2366 LoopVectorizationCostModel::InstWidening Decision = 2367 Cost->getWideningDecision(Instr, VF); 2368 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 2369 "CM decision should be taken at this point"); 2370 if (Decision == LoopVectorizationCostModel::CM_Interleave) 2371 return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask); 2372 2373 Type *ScalarDataTy = getMemInstValueType(Instr); 2374 Type *DataTy = VectorType::get(ScalarDataTy, VF); 2375 // An alignment of 0 means target abi alignment. We need to use the scalar's 2376 // target abi alignment in such a case. 2377 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2378 const Align Alignment = 2379 DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy); 2380 2381 // Determine if the pointer operand of the access is either consecutive or 2382 // reverse consecutive. 2383 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2384 bool ConsecutiveStride = 2385 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2386 bool CreateGatherScatter = 2387 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2388 2389 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2390 // gather/scatter. Otherwise Decision should have been to Scalarize. 
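  // Illustrative summary (assumed VF = 4): CM_Widen turns 'load i32, i32* %p'
  // into a single 'load <4 x i32>', CM_Widen_Reverse additionally reverses
  // the lanes of the loaded/stored value, and CM_GatherScatter emits
  // llvm.masked.gather/llvm.masked.scatter on a vector of pointers.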
2391 assert((ConsecutiveStride || CreateGatherScatter) && 2392 "The instruction should be scalarized"); 2393 (void)ConsecutiveStride; 2394 2395 VectorParts BlockInMaskParts(UF); 2396 bool isMaskRequired = BlockInMask; 2397 if (isMaskRequired) 2398 for (unsigned Part = 0; Part < UF; ++Part) 2399 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2400 2401 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2402 // Calculate the pointer for the specific unroll-part. 2403 GetElementPtrInst *PartPtr = nullptr; 2404 2405 bool InBounds = false; 2406 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2407 InBounds = gep->isInBounds(); 2408 2409 if (Reverse) { 2410 // If the address is consecutive but reversed, then the 2411 // wide store needs to start at the last vector element. 2412 PartPtr = cast<GetElementPtrInst>( 2413 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); 2414 PartPtr->setIsInBounds(InBounds); 2415 PartPtr = cast<GetElementPtrInst>( 2416 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); 2417 PartPtr->setIsInBounds(InBounds); 2418 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2419 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2420 } else { 2421 PartPtr = cast<GetElementPtrInst>( 2422 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); 2423 PartPtr->setIsInBounds(InBounds); 2424 } 2425 2426 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2427 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2428 }; 2429 2430 // Handle Stores: 2431 if (SI) { 2432 setDebugLocFromInst(Builder, SI); 2433 2434 for (unsigned Part = 0; Part < UF; ++Part) { 2435 Instruction *NewSI = nullptr; 2436 Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part); 2437 if (CreateGatherScatter) { 2438 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2439 Value *VectorGep = State.get(Addr, Part); 2440 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, 2441 Alignment.value(), MaskPart); 2442 } else { 2443 if (Reverse) { 2444 // If we store to reverse consecutive memory locations, then we need 2445 // to reverse the order of elements in the stored value. 2446 StoredVal = reverseVector(StoredVal); 2447 // We don't want to update the value in the map as it might be used in 2448 // another expression. So don't call resetVectorValue(StoredVal). 2449 } 2450 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2451 if (isMaskRequired) 2452 NewSI = Builder.CreateMaskedStore( 2453 StoredVal, VecPtr, Alignment.value(), BlockInMaskParts[Part]); 2454 else 2455 NewSI = 2456 Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value()); 2457 } 2458 addMetadata(NewSI, SI); 2459 } 2460 return; 2461 } 2462 2463 // Handle loads. 2464 assert(LI && "Must have a load instruction"); 2465 setDebugLocFromInst(Builder, LI); 2466 for (unsigned Part = 0; Part < UF; ++Part) { 2467 Value *NewLI; 2468 if (CreateGatherScatter) { 2469 Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; 2470 Value *VectorGep = State.get(Addr, Part); 2471 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart, 2472 nullptr, "wide.masked.gather"); 2473 addMetadata(NewLI, LI); 2474 } else { 2475 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2476 if (isMaskRequired) 2477 NewLI = Builder.CreateMaskedLoad( 2478 VecPtr, Alignment.value(), BlockInMaskParts[Part], 2479 UndefValue::get(DataTy), "wide.masked.load"); 2480 else 2481 NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(), 2482 "wide.load"); 2483 2484 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2485 addMetadata(NewLI, LI); 2486 if (Reverse) 2487 NewLI = reverseVector(NewLI); 2488 } 2489 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2490 } 2491 } 2492 2493 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2494 const VPIteration &Instance, 2495 bool IfPredicateInstr) { 2496 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2497 2498 setDebugLocFromInst(Builder, Instr); 2499 2500 // Does this instruction return a value ? 2501 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2502 2503 Instruction *Cloned = Instr->clone(); 2504 if (!IsVoidRetTy) 2505 Cloned->setName(Instr->getName() + ".cloned"); 2506 2507 // Replace the operands of the cloned instructions with their scalar 2508 // equivalents in the new loop. 2509 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 2510 auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance); 2511 Cloned->setOperand(op, NewOp); 2512 } 2513 addNewMetadata(Cloned, Instr); 2514 2515 // Place the cloned scalar in the new loop. 2516 Builder.Insert(Cloned); 2517 2518 // Add the cloned scalar to the scalar map entry. 2519 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2520 2521 // If we just cloned a new assumption, add it the assumption cache. 2522 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2523 if (II->getIntrinsicID() == Intrinsic::assume) 2524 AC->registerAssumption(II); 2525 2526 // End if-block. 2527 if (IfPredicateInstr) 2528 PredicatedInstructions.push_back(Cloned); 2529 } 2530 2531 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2532 Value *End, Value *Step, 2533 Instruction *DL) { 2534 BasicBlock *Header = L->getHeader(); 2535 BasicBlock *Latch = L->getLoopLatch(); 2536 // As we're just creating this loop, it's possible no latch exists 2537 // yet. If so, use the header as this will be a single block loop. 2538 if (!Latch) 2539 Latch = Header; 2540 2541 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2542 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2543 setDebugLocFromInst(Builder, OldInst); 2544 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2545 2546 Builder.SetInsertPoint(Latch->getTerminator()); 2547 setDebugLocFromInst(Builder, OldInst); 2548 2549 // Create i+1 and fill the PHINode. 2550 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2551 Induction->addIncoming(Start, L->getLoopPreheader()); 2552 Induction->addIncoming(Next, Latch); 2553 // Create the compare. 2554 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2555 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2556 2557 // Now we have two terminators. Remove the old one from the block. 
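  // (Clarifying note: CreateCondBr above was emitted at the insert point just
  // before the latch's original terminator, so at this moment the latch ends
  // with both the new conditional branch and the old branch; the old one is
  // deleted next.)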
2558 Latch->getTerminator()->eraseFromParent(); 2559 2560 return Induction; 2561 } 2562 2563 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2564 if (TripCount) 2565 return TripCount; 2566 2567 assert(L && "Create Trip Count for null loop."); 2568 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2569 // Find the loop boundaries. 2570 ScalarEvolution *SE = PSE.getSE(); 2571 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2572 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2573 "Invalid loop count"); 2574 2575 Type *IdxTy = Legal->getWidestInductionType(); 2576 assert(IdxTy && "No type for induction"); 2577 2578 // The exit count might have the type of i64 while the phi is i32. This can 2579 // happen if we have an induction variable that is sign extended before the 2580 // compare. The only way that we get a backedge taken count is that the 2581 // induction variable was signed and as such will not overflow. In such a case 2582 // truncation is legal. 2583 if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() > 2584 IdxTy->getPrimitiveSizeInBits()) 2585 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2586 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2587 2588 // Get the total trip count from the count by adding 1. 2589 const SCEV *ExitCount = SE->getAddExpr( 2590 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2591 2592 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2593 2594 // Expand the trip count and place the new instructions in the preheader. 2595 // Notice that the pre-header does not change, only the loop body. 2596 SCEVExpander Exp(*SE, DL, "induction"); 2597 2598 // Count holds the overall loop count (N). 2599 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2600 L->getLoopPreheader()->getTerminator()); 2601 2602 if (TripCount->getType()->isPointerTy()) 2603 TripCount = 2604 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2605 L->getLoopPreheader()->getTerminator()); 2606 2607 return TripCount; 2608 } 2609 2610 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2611 if (VectorTripCount) 2612 return VectorTripCount; 2613 2614 Value *TC = getOrCreateTripCount(L); 2615 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2616 2617 Type *Ty = TC->getType(); 2618 Constant *Step = ConstantInt::get(Ty, VF * UF); 2619 2620 // If the tail is to be folded by masking, round the number of iterations N 2621 // up to a multiple of Step instead of rounding down. This is done by first 2622 // adding Step-1 and then rounding down. Note that it's ok if this addition 2623 // overflows: the vector induction variable will eventually wrap to zero given 2624 // that it starts at zero and its Step is a power of two; the loop will then 2625 // exit, with the last early-exit vector comparison also producing all-true. 2626 if (Cost->foldTailByMasking()) { 2627 assert(isPowerOf2_32(VF * UF) && 2628 "VF*UF must be a power of 2 when folding tail by masking"); 2629 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); 2630 } 2631 2632 // Now we need to generate the expression for the part of the loop that the 2633 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2634 // iterations are not required for correctness, or N - Step, otherwise. 
Step 2635 // is equal to the vectorization factor (number of SIMD elements) times the 2636 // unroll factor (number of SIMD instructions). 2637 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2638 2639 // If there is a non-reversed interleaved group that may speculatively access 2640 // memory out-of-bounds, we need to ensure that there will be at least one 2641 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2642 // the trip count, we set the remainder to be equal to the step. If the step 2643 // does not evenly divide the trip count, no adjustment is necessary since 2644 // there will already be scalar iterations. Note that the minimum iterations 2645 // check ensures that N >= Step. 2646 if (VF > 1 && Cost->requiresScalarEpilogue()) { 2647 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2648 R = Builder.CreateSelect(IsZero, Step, R); 2649 } 2650 2651 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2652 2653 return VectorTripCount; 2654 } 2655 2656 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2657 const DataLayout &DL) { 2658 // Verify that V is a vector type with same number of elements as DstVTy. 2659 unsigned VF = DstVTy->getNumElements(); 2660 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2661 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2662 Type *SrcElemTy = SrcVecTy->getElementType(); 2663 Type *DstElemTy = DstVTy->getElementType(); 2664 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2665 "Vector elements must have same size"); 2666 2667 // Do a direct cast if element types are castable. 2668 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2669 return Builder.CreateBitOrPointerCast(V, DstVTy); 2670 } 2671 // V cannot be directly casted to desired vector type. 2672 // May happen when V is a floating point vector but DstVTy is a vector of 2673 // pointers or vice-versa. Handle this using a two-step bitcast using an 2674 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2675 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2676 "Only one type should be a pointer type"); 2677 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2678 "Only one type should be a floating point type"); 2679 Type *IntTy = 2680 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2681 VectorType *VecIntTy = VectorType::get(IntTy, VF); 2682 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2683 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2684 } 2685 2686 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2687 BasicBlock *Bypass) { 2688 Value *Count = getOrCreateTripCount(L); 2689 // Reuse existing vector loop preheader for TC checks. 2690 // Note that new preheader block is generated for vector loop. 2691 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2692 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2693 2694 // Generate code to check if the loop's trip count is less than VF * UF, or 2695 // equal to it in case a scalar epilogue is required; this implies that the 2696 // vector trip count is zero. This check also covers the case where adding one 2697 // to the backedge-taken count overflowed leading to an incorrect trip count 2698 // of zero. In this case we will also jump to the scalar loop. 2699 auto P = Cost->requiresScalarEpilogue() ? 
ICmpInst::ICMP_ULE 2700 : ICmpInst::ICMP_ULT; 2701 2702 // If tail is to be folded, vector loop takes care of all iterations. 2703 Value *CheckMinIters = Builder.getFalse(); 2704 if (!Cost->foldTailByMasking()) 2705 CheckMinIters = Builder.CreateICmp( 2706 P, Count, ConstantInt::get(Count->getType(), VF * UF), 2707 "min.iters.check"); 2708 2709 // Create new preheader for vector loop. 2710 LoopVectorPreHeader = 2711 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2712 "vector.ph"); 2713 2714 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2715 DT->getNode(Bypass)->getIDom()) && 2716 "TC check is expected to dominate Bypass"); 2717 2718 // Update dominator for Bypass & LoopExit. 2719 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2720 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2721 2722 ReplaceInstWithInst( 2723 TCCheckBlock->getTerminator(), 2724 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2725 LoopBypassBlocks.push_back(TCCheckBlock); 2726 } 2727 2728 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2729 // Reuse existing vector loop preheader for SCEV checks. 2730 // Note that new preheader block is generated for vector loop. 2731 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 2732 2733 // Generate the code to check that the SCEV assumptions that we made. 2734 // We want the new basic block to start at the first instruction in a 2735 // sequence of instructions that form a check. 2736 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2737 "scev.check"); 2738 Value *SCEVCheck = Exp.expandCodeForPredicate( 2739 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 2740 2741 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2742 if (C->isZero()) 2743 return; 2744 2745 assert(!SCEVCheckBlock->getParent()->hasOptSize() && 2746 "Cannot SCEV check stride or overflow when optimizing for size"); 2747 2748 SCEVCheckBlock->setName("vector.scevcheck"); 2749 // Create new preheader for vector loop. 2750 LoopVectorPreHeader = 2751 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 2752 nullptr, "vector.ph"); 2753 2754 // Update dominator only if this is first RT check. 2755 if (LoopBypassBlocks.empty()) { 2756 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2757 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2758 } 2759 2760 ReplaceInstWithInst( 2761 SCEVCheckBlock->getTerminator(), 2762 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 2763 LoopBypassBlocks.push_back(SCEVCheckBlock); 2764 AddedSafetyChecks = true; 2765 } 2766 2767 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2768 // VPlan-native path does not do any analysis for runtime checks currently. 2769 if (EnableVPlanNativePath) 2770 return; 2771 2772 // Reuse existing vector loop preheader for runtime memory checks. 2773 // Note that new preheader block is generated for vector loop. 2774 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 2775 2776 // Generate the code that checks in runtime if arrays overlap. We put the 2777 // checks into a separate block to make the more common case of few elements 2778 // faster. 
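  // Illustration only (the real IR is produced by addRuntimeChecks below):
  // for two arrays A and B accessed over the trip count, the group check is
  // conceptually
  //   vector.memcheck:
  //     %conflict = (A.start < B.end) & (B.start < A.end)
  //     br i1 %conflict, label %scalar.ph, label %vector.ph
  // i.e. the vector loop is only entered when the accessed ranges are
  // provably disjoint at run time.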
2779 Instruction *FirstCheckInst; 2780 Instruction *MemRuntimeCheck; 2781 std::tie(FirstCheckInst, MemRuntimeCheck) = 2782 Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator()); 2783 if (!MemRuntimeCheck) 2784 return; 2785 2786 if (MemCheckBlock->getParent()->hasOptSize()) { 2787 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2788 "Cannot emit memory checks when optimizing for size, unless forced " 2789 "to vectorize."); 2790 ORE->emit([&]() { 2791 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2792 L->getStartLoc(), L->getHeader()) 2793 << "Code-size may be reduced by not forcing " 2794 "vectorization, or by source-code modifications " 2795 "eliminating the need for runtime checks " 2796 "(e.g., adding 'restrict')."; 2797 }); 2798 } 2799 2800 MemCheckBlock->setName("vector.memcheck"); 2801 // Create new preheader for vector loop. 2802 LoopVectorPreHeader = 2803 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2804 "vector.ph"); 2805 2806 // Update dominator only if this is first RT check. 2807 if (LoopBypassBlocks.empty()) { 2808 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2809 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2810 } 2811 2812 ReplaceInstWithInst( 2813 MemCheckBlock->getTerminator(), 2814 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); 2815 LoopBypassBlocks.push_back(MemCheckBlock); 2816 AddedSafetyChecks = true; 2817 2818 // We currently don't use LoopVersioning for the actual loop cloning but we 2819 // still use it to add the noalias metadata. 2820 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2821 PSE.getSE()); 2822 LVer->prepareNoAliasMetadata(); 2823 } 2824 2825 Value *InnerLoopVectorizer::emitTransformedIndex( 2826 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2827 const InductionDescriptor &ID) const { 2828 2829 SCEVExpander Exp(*SE, DL, "induction"); 2830 auto Step = ID.getStep(); 2831 auto StartValue = ID.getStartValue(); 2832 assert(Index->getType() == Step->getType() && 2833 "Index type does not match StepValue type"); 2834 2835 // Note: the IR at this point is broken. We cannot use SE to create any new 2836 // SCEV and then expand it, hoping that SCEV's simplification will give us 2837 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2838 // lead to various SCEV crashes. So all we can do is to use builder and rely 2839 // on InstCombine for future simplifications. Here we handle some trivial 2840 // cases only. 
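  // Illustration only: the two helpers below fold the trivial identities,
  //   CreateAdd(%start, 0) -> %start   (no 'add' emitted)
  //   CreateMul(%index, 1) -> %index   (no 'mul' emitted)
  // and emit a plain add/mul for everything else, leaving further
  // simplification to InstCombine.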
2841 auto CreateAdd = [&B](Value *X, Value *Y) { 2842 assert(X->getType() == Y->getType() && "Types don't match!"); 2843 if (auto *CX = dyn_cast<ConstantInt>(X)) 2844 if (CX->isZero()) 2845 return Y; 2846 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2847 if (CY->isZero()) 2848 return X; 2849 return B.CreateAdd(X, Y); 2850 }; 2851 2852 auto CreateMul = [&B](Value *X, Value *Y) { 2853 assert(X->getType() == Y->getType() && "Types don't match!"); 2854 if (auto *CX = dyn_cast<ConstantInt>(X)) 2855 if (CX->isOne()) 2856 return Y; 2857 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2858 if (CY->isOne()) 2859 return X; 2860 return B.CreateMul(X, Y); 2861 }; 2862 2863 switch (ID.getKind()) { 2864 case InductionDescriptor::IK_IntInduction: { 2865 assert(Index->getType() == StartValue->getType() && 2866 "Index type does not match StartValue type"); 2867 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 2868 return B.CreateSub(StartValue, Index); 2869 auto *Offset = CreateMul( 2870 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); 2871 return CreateAdd(StartValue, Offset); 2872 } 2873 case InductionDescriptor::IK_PtrInduction: { 2874 assert(isa<SCEVConstant>(Step) && 2875 "Expected constant step for pointer induction"); 2876 return B.CreateGEP( 2877 StartValue->getType()->getPointerElementType(), StartValue, 2878 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), 2879 &*B.GetInsertPoint()))); 2880 } 2881 case InductionDescriptor::IK_FpInduction: { 2882 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2883 auto InductionBinOp = ID.getInductionBinOp(); 2884 assert(InductionBinOp && 2885 (InductionBinOp->getOpcode() == Instruction::FAdd || 2886 InductionBinOp->getOpcode() == Instruction::FSub) && 2887 "Original bin op should be defined for FP induction"); 2888 2889 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 2890 2891 // Floating point operations had to be 'fast' to enable the induction. 2892 FastMathFlags Flags; 2893 Flags.setFast(); 2894 2895 Value *MulExp = B.CreateFMul(StepValue, Index); 2896 if (isa<Instruction>(MulExp)) 2897 // We have to check, the MulExp may be a constant. 2898 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 2899 2900 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2901 "induction"); 2902 if (isa<Instruction>(BOp)) 2903 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2904 2905 return BOp; 2906 } 2907 case InductionDescriptor::IK_NoInduction: 2908 return nullptr; 2909 } 2910 llvm_unreachable("invalid enum"); 2911 } 2912 2913 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 2914 /* 2915 In this function we generate a new loop. The new loop will contain 2916 the vectorized instructions while the old loop will continue to run the 2917 scalar remainder. 2918 2919 [ ] <-- loop iteration number check. 2920 / | 2921 / v 2922 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2923 | / | 2924 | / v 2925 || [ ] <-- vector pre header. 2926 |/ | 2927 | v 2928 | [ ] \ 2929 | [ ]_| <-- vector loop. 2930 | | 2931 | v 2932 | -[ ] <--- middle-block. 2933 | / | 2934 | / v 2935 -|- >[ ] <--- new preheader. 2936 | | 2937 | v 2938 | [ ] \ 2939 | [ ]_| <-- old scalar loop to handle remainder. 2940 \ | 2941 \ v 2942 >[ ] <-- exit block. 2943 ... 2944 */ 2945 2946 MDNode *OrigLoopID = OrigLoop->getLoopID(); 2947 2948 // Some loops have a single integer induction variable, while other loops 2949 // don't. 
One example is c++ iterators that often have multiple pointer 2950 // induction variables. In the code below we also support a case where we 2951 // don't have a single induction variable. 2952 // 2953 // We try to obtain an induction variable from the original loop as hard 2954 // as possible. However if we don't find one that: 2955 // - is an integer 2956 // - counts from zero, stepping by one 2957 // - is the size of the widest induction variable type 2958 // then we create a new one. 2959 OldInduction = Legal->getPrimaryInduction(); 2960 Type *IdxTy = Legal->getWidestInductionType(); 2961 2962 // Split the single block loop into the two loop structure described above. 2963 LoopScalarBody = OrigLoop->getHeader(); 2964 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 2965 LoopExitBlock = OrigLoop->getExitBlock(); 2966 assert(LoopExitBlock && "Must have an exit block"); 2967 assert(LoopVectorPreHeader && "Invalid loop structure"); 2968 2969 LoopMiddleBlock = 2970 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2971 LI, nullptr, "middle.block"); 2972 LoopScalarPreHeader = 2973 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 2974 nullptr, "scalar.ph"); 2975 // We intentionally don't let SplitBlock to update LoopInfo since 2976 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 2977 // LoopVectorBody is explicitly added to the correct place few lines later. 2978 LoopVectorBody = 2979 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2980 nullptr, nullptr, "vector.body"); 2981 2982 // Update dominator for loop exit. 2983 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 2984 2985 // Create and register the new vector loop. 2986 Loop *Lp = LI->AllocateLoop(); 2987 Loop *ParentLoop = OrigLoop->getParentLoop(); 2988 2989 // Insert the new loop into the loop nest and register the new basic blocks 2990 // before calling any utilities such as SCEV that require valid LoopInfo. 2991 if (ParentLoop) { 2992 ParentLoop->addChildLoop(Lp); 2993 } else { 2994 LI->addTopLevelLoop(Lp); 2995 } 2996 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 2997 2998 // Find the loop boundaries. 2999 Value *Count = getOrCreateTripCount(Lp); 3000 3001 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3002 3003 // Now, compare the new count to zero. If it is zero skip the vector loop and 3004 // jump to the scalar loop. This check also covers the case where the 3005 // backedge-taken count is uint##_max: adding one to it will overflow leading 3006 // to an incorrect trip count of zero. In this (rare) case we will also jump 3007 // to the scalar loop. 3008 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3009 3010 // Generate the code to check any assumptions that we've made for SCEV 3011 // expressions. 3012 emitSCEVChecks(Lp, LoopScalarPreHeader); 3013 3014 // Generate the code that checks in runtime if arrays overlap. We put the 3015 // checks into a separate block to make the more common case of few elements 3016 // faster. 3017 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3018 3019 // Generate the induction variable. 3020 // The loop step is equal to the vectorization factor (num of SIMD elements) 3021 // times the unroll factor (num of SIMD instructions). 
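  // For example (values purely illustrative): with VF = 4 and UF = 2 the
  // step below is 8, so the canonical induction created here visits
  // 0, 8, 16, ... and the vector loop exits once it reaches the vector
  // trip count (CountRoundDown).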
3022 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3023 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 3024 Induction = 3025 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3026 getDebugLocFromInstOrOperands(OldInduction)); 3027 3028 // We are going to resume the execution of the scalar loop. 3029 // Go over all of the induction variables that we found and fix the 3030 // PHIs that are left in the scalar version of the loop. 3031 // The starting values of PHI nodes depend on the counter of the last 3032 // iteration in the vectorized loop. 3033 // If we come from a bypass edge then we need to start from the original 3034 // start value. 3035 3036 // This variable saves the new starting index for the scalar loop. It is used 3037 // to test if there are any tail iterations left once the vector loop has 3038 // completed. 3039 LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); 3040 for (auto &InductionEntry : *List) { 3041 PHINode *OrigPhi = InductionEntry.first; 3042 InductionDescriptor II = InductionEntry.second; 3043 3044 // Create phi nodes to merge from the backedge-taken check block. 3045 PHINode *BCResumeVal = 3046 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3047 LoopScalarPreHeader->getTerminator()); 3048 // Copy original phi DL over to the new one. 3049 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3050 Value *&EndValue = IVEndValues[OrigPhi]; 3051 if (OrigPhi == OldInduction) { 3052 // We know what the end value is. 3053 EndValue = CountRoundDown; 3054 } else { 3055 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); 3056 Type *StepType = II.getStep()->getType(); 3057 Instruction::CastOps CastOp = 3058 CastInst::getCastOpcode(CountRoundDown, true, StepType, true); 3059 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); 3060 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3061 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3062 EndValue->setName("ind.end"); 3063 } 3064 3065 // The new PHI merges the original incoming value, in case of a bypass, 3066 // or the value at the end of the vectorized loop. 3067 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3068 3069 // Fix the scalar body counter (PHI node). 3070 // The old induction's phi node in the scalar body needs the truncated 3071 // value. 3072 for (BasicBlock *BB : LoopBypassBlocks) 3073 BCResumeVal->addIncoming(II.getStartValue(), BB); 3074 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3075 } 3076 3077 // We need the OrigLoop (scalar loop part) latch terminator to help 3078 // produce correct debug info for the middle block BB instructions. 3079 // The legality check stage guarantees that the loop will have a single 3080 // latch. 3081 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3082 "Scalar loop latch terminator isn't a branch"); 3083 BranchInst *ScalarLatchBr = 3084 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3085 3086 // Add a check in the middle block to see if we have completed 3087 // all of the iterations in the first vector loop. 3088 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3089 // If tail is to be folded, we know we don't need to run the remainder. 
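  // Worked example (illustrative only): with N = 10 and VF * UF = 4 the
  // vector trip count is 8, so CmpN below is (10 == 8) = false and the two
  // leftover iterations run in the scalar remainder loop; with N = 12 the
  // compare is true and the middle block branches straight to the exit
  // block.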
3090 Value *CmpN = Builder.getTrue(); 3091 if (!Cost->foldTailByMasking()) { 3092 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3093 CountRoundDown, "cmp.n", 3094 LoopMiddleBlock->getTerminator()); 3095 3096 // Here we use the same DebugLoc as the scalar loop latch branch instead 3097 // of the corresponding compare because they may have ended up with 3098 // different line numbers and we want to avoid awkward line stepping while 3099 // debugging. Eg. if the compare has got a line number inside the loop. 3100 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3101 } 3102 3103 BranchInst *BrInst = 3104 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3105 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3106 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3107 3108 // Get ready to start creating new instructions into the vectorized body. 3109 assert(LoopVectorPreHeader == Lp->getLoopPreheader() && 3110 "Inconsistent vector loop preheader"); 3111 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3112 3113 Optional<MDNode *> VectorizedLoopID = 3114 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3115 LLVMLoopVectorizeFollowupVectorized}); 3116 if (VectorizedLoopID.hasValue()) { 3117 Lp->setLoopID(VectorizedLoopID.getValue()); 3118 3119 // Do not setAlreadyVectorized if loop attributes have been defined 3120 // explicitly. 3121 return LoopVectorPreHeader; 3122 } 3123 3124 // Keep all loop hints from the original loop on the vector loop (we'll 3125 // replace the vectorizer-specific hints below). 3126 if (MDNode *LID = OrigLoop->getLoopID()) 3127 Lp->setLoopID(LID); 3128 3129 LoopVectorizeHints Hints(Lp, true, *ORE); 3130 Hints.setAlreadyVectorized(); 3131 3132 #ifdef EXPENSIVE_CHECKS 3133 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3134 LI->verify(*DT); 3135 #endif 3136 3137 return LoopVectorPreHeader; 3138 } 3139 3140 // Fix up external users of the induction variable. At this point, we are 3141 // in LCSSA form, with all external PHIs that use the IV having one input value, 3142 // coming from the remainder loop. We need those PHIs to also have a correct 3143 // value for the IV when arriving directly from the middle block. 3144 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3145 const InductionDescriptor &II, 3146 Value *CountRoundDown, Value *EndValue, 3147 BasicBlock *MiddleBlock) { 3148 // There are two kinds of external IV usages - those that use the value 3149 // computed in the last iteration (the PHI) and those that use the penultimate 3150 // value (the value that feeds into the phi from the loop latch). 3151 // We allow both, but they, obviously, have different values. 3152 3153 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3154 3155 DenseMap<Value *, Value *> MissingVals; 3156 3157 // An external user of the last iteration's value should see the value that 3158 // the remainder loop uses to initialize its own IV. 3159 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3160 for (User *U : PostInc->users()) { 3161 Instruction *UI = cast<Instruction>(U); 3162 if (!OrigLoop->contains(UI)) { 3163 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3164 MissingVals[UI] = EndValue; 3165 } 3166 } 3167 3168 // An external user of the penultimate value need to see EndValue - Step. 3169 // The simplest way to get this is to recompute it from the constituent SCEVs, 3170 // that is Start + (Step * (CRD - 1)). 
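  // Worked example (illustrative only): for an IV with Start = 5, Step = 2
  // and vector trip count CRD = 8, EndValue is 5 + 2 * 8 = 21, while an
  // external user of the phi itself must see the penultimate value
  // 5 + 2 * (8 - 1) = 19, which is what the code below materializes in the
  // middle block.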
3171 for (User *U : OrigPhi->users()) { 3172 auto *UI = cast<Instruction>(U); 3173 if (!OrigLoop->contains(UI)) { 3174 const DataLayout &DL = 3175 OrigLoop->getHeader()->getModule()->getDataLayout(); 3176 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3177 3178 IRBuilder<> B(MiddleBlock->getTerminator()); 3179 Value *CountMinusOne = B.CreateSub( 3180 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3181 Value *CMO = 3182 !II.getStep()->getType()->isIntegerTy() 3183 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3184 II.getStep()->getType()) 3185 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3186 CMO->setName("cast.cmo"); 3187 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3188 Escape->setName("ind.escape"); 3189 MissingVals[UI] = Escape; 3190 } 3191 } 3192 3193 for (auto &I : MissingVals) { 3194 PHINode *PHI = cast<PHINode>(I.first); 3195 // One corner case we have to handle is two IVs "chasing" each-other, 3196 // that is %IV2 = phi [...], [ %IV1, %latch ] 3197 // In this case, if IV1 has an external use, we need to avoid adding both 3198 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3199 // don't already have an incoming value for the middle block. 3200 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3201 PHI->addIncoming(I.second, MiddleBlock); 3202 } 3203 } 3204 3205 namespace { 3206 3207 struct CSEDenseMapInfo { 3208 static bool canHandle(const Instruction *I) { 3209 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3210 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3211 } 3212 3213 static inline Instruction *getEmptyKey() { 3214 return DenseMapInfo<Instruction *>::getEmptyKey(); 3215 } 3216 3217 static inline Instruction *getTombstoneKey() { 3218 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3219 } 3220 3221 static unsigned getHashValue(const Instruction *I) { 3222 assert(canHandle(I) && "Unknown instruction!"); 3223 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3224 I->value_op_end())); 3225 } 3226 3227 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3228 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3229 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3230 return LHS == RHS; 3231 return LHS->isIdenticalTo(RHS); 3232 } 3233 }; 3234 3235 } // end anonymous namespace 3236 3237 ///Perform cse of induction variable instructions. 3238 static void cse(BasicBlock *BB) { 3239 // Perform simple cse. 3240 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3241 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3242 Instruction *In = &*I++; 3243 3244 if (!CSEDenseMapInfo::canHandle(In)) 3245 continue; 3246 3247 // Check if we can replace this instruction with any of the 3248 // visited instructions. 3249 if (Instruction *V = CSEMap.lookup(In)) { 3250 In->replaceAllUsesWith(V); 3251 In->eraseFromParent(); 3252 continue; 3253 } 3254 3255 CSEMap[In] = In; 3256 } 3257 } 3258 3259 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3260 unsigned VF, 3261 bool &NeedToScalarize) { 3262 Function *F = CI->getCalledFunction(); 3263 Type *ScalarRetTy = CI->getType(); 3264 SmallVector<Type *, 4> Tys, ScalarTys; 3265 for (auto &ArgOp : CI->arg_operands()) 3266 ScalarTys.push_back(ArgOp->getType()); 3267 3268 // Estimate cost of scalarized vector call. 
The source operands are assumed 3269 // to be vectors, so we need to extract individual elements from there, 3270 // execute VF scalar calls, and then gather the result into the vector return 3271 // value. 3272 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); 3273 if (VF == 1) 3274 return ScalarCallCost; 3275 3276 // Compute corresponding vector type for return value and arguments. 3277 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3278 for (Type *ScalarTy : ScalarTys) 3279 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3280 3281 // Compute costs of unpacking argument values for the scalar calls and 3282 // packing the return values to a vector. 3283 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3284 3285 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3286 3287 // If we can't emit a vector call for this function, then the currently found 3288 // cost is the cost we need to return. 3289 NeedToScalarize = true; 3290 if (!TLI || CI->isNoBuiltin() || VFDatabase::getMappings(*CI).empty()) 3291 return Cost; 3292 3293 // If the corresponding vector cost is cheaper, return its cost. 3294 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); 3295 if (VectorCallCost < Cost) { 3296 NeedToScalarize = false; 3297 return VectorCallCost; 3298 } 3299 return Cost; 3300 } 3301 3302 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3303 unsigned VF) { 3304 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3305 assert(ID && "Expected intrinsic call!"); 3306 3307 FastMathFlags FMF; 3308 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3309 FMF = FPMO->getFastMathFlags(); 3310 3311 SmallVector<Value *, 4> Operands(CI->arg_operands()); 3312 return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF); 3313 } 3314 3315 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3316 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3317 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3318 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3319 } 3320 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3321 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3322 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3323 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3324 } 3325 3326 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3327 // For every instruction `I` in MinBWs, truncate the operands, create a 3328 // truncated version of `I` and reextend its result. InstCombine runs 3329 // later and will remove any ext/trunc pairs. 3330 SmallPtrSet<Value *, 4> Erased; 3331 for (const auto &KV : Cost->getMinimalBitwidths()) { 3332 // If the value wasn't vectorized, we must maintain the original scalar 3333 // type. The absence of the value from VectorLoopValueMap indicates that it 3334 // wasn't vectorized. 
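    // For the values that were vectorized, the rewrite performed below is,
    // for example (illustrative only, assuming VF = 4 and a minimal
    // bitwidth of 8 for an i32 add):
    //   %a = add <4 x i32> %x, %y
    // becomes
    //   %xt = trunc <4 x i32> %x to <4 x i8>
    //   %yt = trunc <4 x i32> %y to <4 x i8>
    //   %at = add <4 x i8> %xt, %yt
    //   %a  = zext <4 x i8> %at to <4 x i32>
    // with InstCombine expected to clean up redundant trunc/ext pairs.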
3335 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3336 continue; 3337 for (unsigned Part = 0; Part < UF; ++Part) { 3338 Value *I = getOrCreateVectorValue(KV.first, Part); 3339 if (Erased.find(I) != Erased.end() || I->use_empty() || 3340 !isa<Instruction>(I)) 3341 continue; 3342 Type *OriginalTy = I->getType(); 3343 Type *ScalarTruncatedTy = 3344 IntegerType::get(OriginalTy->getContext(), KV.second); 3345 Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, 3346 OriginalTy->getVectorNumElements()); 3347 if (TruncatedTy == OriginalTy) 3348 continue; 3349 3350 IRBuilder<> B(cast<Instruction>(I)); 3351 auto ShrinkOperand = [&](Value *V) -> Value * { 3352 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3353 if (ZI->getSrcTy() == TruncatedTy) 3354 return ZI->getOperand(0); 3355 return B.CreateZExtOrTrunc(V, TruncatedTy); 3356 }; 3357 3358 // The actual instruction modification depends on the instruction type, 3359 // unfortunately. 3360 Value *NewI = nullptr; 3361 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3362 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3363 ShrinkOperand(BO->getOperand(1))); 3364 3365 // Any wrapping introduced by shrinking this operation shouldn't be 3366 // considered undefined behavior. So, we can't unconditionally copy 3367 // arithmetic wrapping flags to NewI. 3368 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3369 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3370 NewI = 3371 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3372 ShrinkOperand(CI->getOperand(1))); 3373 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3374 NewI = B.CreateSelect(SI->getCondition(), 3375 ShrinkOperand(SI->getTrueValue()), 3376 ShrinkOperand(SI->getFalseValue())); 3377 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3378 switch (CI->getOpcode()) { 3379 default: 3380 llvm_unreachable("Unhandled cast!"); 3381 case Instruction::Trunc: 3382 NewI = ShrinkOperand(CI->getOperand(0)); 3383 break; 3384 case Instruction::SExt: 3385 NewI = B.CreateSExtOrTrunc( 3386 CI->getOperand(0), 3387 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3388 break; 3389 case Instruction::ZExt: 3390 NewI = B.CreateZExtOrTrunc( 3391 CI->getOperand(0), 3392 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3393 break; 3394 } 3395 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3396 auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); 3397 auto *O0 = B.CreateZExtOrTrunc( 3398 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3399 auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); 3400 auto *O1 = B.CreateZExtOrTrunc( 3401 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3402 3403 NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); 3404 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3405 // Don't do anything with the operands, just extend the result. 
3406 continue; 3407 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3408 auto Elements = IE->getOperand(0)->getType()->getVectorNumElements(); 3409 auto *O0 = B.CreateZExtOrTrunc( 3410 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3411 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3412 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3413 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3414 auto Elements = EE->getOperand(0)->getType()->getVectorNumElements(); 3415 auto *O0 = B.CreateZExtOrTrunc( 3416 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3417 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3418 } else { 3419 // If we don't know what to do, be conservative and don't do anything. 3420 continue; 3421 } 3422 3423 // Lastly, extend the result. 3424 NewI->takeName(cast<Instruction>(I)); 3425 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3426 I->replaceAllUsesWith(Res); 3427 cast<Instruction>(I)->eraseFromParent(); 3428 Erased.insert(I); 3429 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3430 } 3431 } 3432 3433 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3434 for (const auto &KV : Cost->getMinimalBitwidths()) { 3435 // If the value wasn't vectorized, we must maintain the original scalar 3436 // type. The absence of the value from VectorLoopValueMap indicates that it 3437 // wasn't vectorized. 3438 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3439 continue; 3440 for (unsigned Part = 0; Part < UF; ++Part) { 3441 Value *I = getOrCreateVectorValue(KV.first, Part); 3442 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3443 if (Inst && Inst->use_empty()) { 3444 Value *NewI = Inst->getOperand(0); 3445 Inst->eraseFromParent(); 3446 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3447 } 3448 } 3449 } 3450 } 3451 3452 void InnerLoopVectorizer::fixVectorizedLoop() { 3453 // Insert truncates and extends for any truncated instructions as hints to 3454 // InstCombine. 3455 if (VF > 1) 3456 truncateToMinimalBitwidths(); 3457 3458 // Fix widened non-induction PHIs by setting up the PHI operands. 3459 if (OrigPHIsToFix.size()) { 3460 assert(EnableVPlanNativePath && 3461 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3462 fixNonInductionPHIs(); 3463 } 3464 3465 // At this point every instruction in the original loop is widened to a 3466 // vector form. Now we need to fix the recurrences in the loop. These PHI 3467 // nodes are currently empty because we did not want to introduce cycles. 3468 // This is the second stage of vectorizing recurrences. 3469 fixCrossIterationPHIs(); 3470 3471 // Forget the original basic block. 3472 PSE.getSE()->forgetLoop(OrigLoop); 3473 3474 // Fix-up external users of the induction variables. 3475 for (auto &Entry : *Legal->getInductionVars()) 3476 fixupIVUsers(Entry.first, Entry.second, 3477 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3478 IVEndValues[Entry.first], LoopMiddleBlock); 3479 3480 fixLCSSAPHIs(); 3481 for (Instruction *PI : PredicatedInstructions) 3482 sinkScalarOperands(&*PI); 3483 3484 // Remove redundant induction instructions. 3485 cse(LoopVectorBody); 3486 } 3487 3488 void InnerLoopVectorizer::fixCrossIterationPHIs() { 3489 // In order to support recurrences we need to be able to vectorize Phi nodes. 3490 // Phi nodes have cycles, so we need to vectorize them in two stages. 
This is 3491 // stage #2: We now need to fix the recurrences by adding incoming edges to 3492 // the currently empty PHI nodes. At this point every instruction in the 3493 // original loop is widened to a vector form so we can use them to construct 3494 // the incoming edges. 3495 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 3496 // Handle first-order recurrences and reductions that need to be fixed. 3497 if (Legal->isFirstOrderRecurrence(&Phi)) 3498 fixFirstOrderRecurrence(&Phi); 3499 else if (Legal->isReductionVariable(&Phi)) 3500 fixReduction(&Phi); 3501 } 3502 } 3503 3504 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3505 // This is the second phase of vectorizing first-order recurrences. An 3506 // overview of the transformation is described below. Suppose we have the 3507 // following loop. 3508 // 3509 // for (int i = 0; i < n; ++i) 3510 // b[i] = a[i] - a[i - 1]; 3511 // 3512 // There is a first-order recurrence on "a". For this loop, the shorthand 3513 // scalar IR looks like: 3514 // 3515 // scalar.ph: 3516 // s_init = a[-1] 3517 // br scalar.body 3518 // 3519 // scalar.body: 3520 // i = phi [0, scalar.ph], [i+1, scalar.body] 3521 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3522 // s2 = a[i] 3523 // b[i] = s2 - s1 3524 // br cond, scalar.body, ... 3525 // 3526 // In this example, s1 is a recurrence because it's value depends on the 3527 // previous iteration. In the first phase of vectorization, we created a 3528 // temporary value for s1. We now complete the vectorization and produce the 3529 // shorthand vector IR shown below (for VF = 4, UF = 1). 3530 // 3531 // vector.ph: 3532 // v_init = vector(..., ..., ..., a[-1]) 3533 // br vector.body 3534 // 3535 // vector.body 3536 // i = phi [0, vector.ph], [i+4, vector.body] 3537 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3538 // v2 = a[i, i+1, i+2, i+3]; 3539 // v3 = vector(v1(3), v2(0, 1, 2)) 3540 // b[i, i+1, i+2, i+3] = v2 - v3 3541 // br cond, vector.body, middle.block 3542 // 3543 // middle.block: 3544 // x = v2(3) 3545 // br scalar.ph 3546 // 3547 // scalar.ph: 3548 // s_init = phi [x, middle.block], [a[-1], otherwise] 3549 // br scalar.body 3550 // 3551 // After execution completes the vector loop, we extract the next value of 3552 // the recurrence (x) to use as the initial value in the scalar loop. 3553 3554 // Get the original loop preheader and single loop latch. 3555 auto *Preheader = OrigLoop->getLoopPreheader(); 3556 auto *Latch = OrigLoop->getLoopLatch(); 3557 3558 // Get the initial and previous values of the scalar recurrence. 3559 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 3560 auto *Previous = Phi->getIncomingValueForBlock(Latch); 3561 3562 // Create a vector from the initial value. 3563 auto *VectorInit = ScalarInit; 3564 if (VF > 1) { 3565 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3566 VectorInit = Builder.CreateInsertElement( 3567 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3568 Builder.getInt32(VF - 1), "vector.recur.init"); 3569 } 3570 3571 // We constructed a temporary phi node in the first phase of vectorization. 3572 // This phi node will eventually be deleted. 3573 Builder.SetInsertPoint( 3574 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3575 3576 // Create a phi node for the new recurrence. The current value will either be 3577 // the initial value inserted into a vector or loop-varying vector value. 
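  // Illustration only (assuming VF = 4 and i32 elements): the phi created
  // below corresponds to v1 in the sketch above,
  //   %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ],
  //                                 [ <last part of v2>, %vector.body ]
  // with the second incoming value filled in once the latch value is known.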
3578 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3579 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3580 3581 // Get the vectorized previous value of the last part UF - 1. It appears last 3582 // among all unrolled iterations, due to the order of their construction. 3583 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3584 3585 // Find and set the insertion point after the previous value if it is an 3586 // instruction. 3587 BasicBlock::iterator InsertPt; 3588 // Note that the previous value may have been constant-folded so it is not 3589 // guaranteed to be an instruction in the vector loop. 3590 // FIXME: Loop invariant values do not form recurrences. We should deal with 3591 // them earlier. 3592 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3593 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3594 else { 3595 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3596 if (isa<PHINode>(PreviousLastPart)) 3597 // If the previous value is a phi node, we should insert after all the phi 3598 // nodes in the block containing the PHI to avoid breaking basic block 3599 // verification. Note that the basic block may be different to 3600 // LoopVectorBody, in case we predicate the loop. 3601 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3602 else 3603 InsertPt = ++PreviousInst->getIterator(); 3604 } 3605 Builder.SetInsertPoint(&*InsertPt); 3606 3607 // We will construct a vector for the recurrence by combining the values for 3608 // the current and previous iterations. This is the required shuffle mask. 3609 SmallVector<Constant *, 8> ShuffleMask(VF); 3610 ShuffleMask[0] = Builder.getInt32(VF - 1); 3611 for (unsigned I = 1; I < VF; ++I) 3612 ShuffleMask[I] = Builder.getInt32(I + VF - 1); 3613 3614 // The vector from which to take the initial value for the current iteration 3615 // (actual or unrolled). Initially, this is the vector phi node. 3616 Value *Incoming = VecPhi; 3617 3618 // Shuffle the current and previous vector and update the vector parts. 3619 for (unsigned Part = 0; Part < UF; ++Part) { 3620 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3621 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3622 auto *Shuffle = 3623 VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, 3624 ConstantVector::get(ShuffleMask)) 3625 : Incoming; 3626 PhiPart->replaceAllUsesWith(Shuffle); 3627 cast<Instruction>(PhiPart)->eraseFromParent(); 3628 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3629 Incoming = PreviousPart; 3630 } 3631 3632 // Fix the latch value of the new recurrence in the vector loop. 3633 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3634 3635 // Extract the last vector element in the middle block. This will be the 3636 // initial value for the recurrence when jumping to the scalar loop. 3637 auto *ExtractForScalar = Incoming; 3638 if (VF > 1) { 3639 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3640 ExtractForScalar = Builder.CreateExtractElement( 3641 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); 3642 } 3643 // Extract the second last element in the middle block if the 3644 // Phi is used outside the loop. We need to extract the phi itself 3645 // and not the last element (the phi update in the current iteration). This 3646 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3647 // when the scalar loop is not run at all. 
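  // Worked example (illustrative only, VF = 4): at this point Incoming is
  // the last part of v2 from the sketch above, so the extract created above
  // yields v2(3) (lane VF - 1) as the scalar loop's resume value, while the
  // extract emitted below takes v2(2) (lane VF - 2) for phi users outside
  // the loop, i.e. the value s1 held in the final iteration rather than its
  // update.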
3648 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3649 if (VF > 1) 3650 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3651 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); 3652 // When loop is unrolled without vectorizing, initialize 3653 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 3654 // `Incoming`. This is analogous to the vectorized case above: extracting the 3655 // second last element when VF > 1. 3656 else if (UF > 1) 3657 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 3658 3659 // Fix the initial value of the original recurrence in the scalar loop. 3660 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3661 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3662 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3663 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3664 Start->addIncoming(Incoming, BB); 3665 } 3666 3667 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3668 Phi->setName("scalar.recur"); 3669 3670 // Finally, fix users of the recurrence outside the loop. The users will need 3671 // either the last value of the scalar recurrence or the last value of the 3672 // vector recurrence we extracted in the middle block. Since the loop is in 3673 // LCSSA form, we just need to find all the phi nodes for the original scalar 3674 // recurrence in the exit block, and then add an edge for the middle block. 3675 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3676 if (LCSSAPhi.getIncomingValue(0) == Phi) { 3677 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 3678 } 3679 } 3680 } 3681 3682 void InnerLoopVectorizer::fixReduction(PHINode *Phi) { 3683 Constant *Zero = Builder.getInt32(0); 3684 3685 // Get it's reduction variable descriptor. 3686 assert(Legal->isReductionVariable(Phi) && 3687 "Unable to find the reduction variable"); 3688 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi]; 3689 3690 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3691 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3692 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3693 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = 3694 RdxDesc.getMinMaxRecurrenceKind(); 3695 setDebugLocFromInst(Builder, ReductionStartValue); 3696 3697 // We need to generate a reduction vector from the incoming scalar. 3698 // To do so, we need to generate the 'identity' vector and override 3699 // one of the elements with the incoming scalar reduction. We need 3700 // to do it in the vector-loop preheader. 3701 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3702 3703 // This is the vector-clone of the value that leaves the loop. 3704 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); 3705 3706 // Find the reduction identity variable. Zero for addition, or, xor, 3707 // one for multiplication, -1 for And. 3708 Value *Identity; 3709 Value *VectorStart; 3710 if (RK == RecurrenceDescriptor::RK_IntegerMinMax || 3711 RK == RecurrenceDescriptor::RK_FloatMinMax) { 3712 // MinMax reduction have the start value as their identify. 
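    // Illustration only (VF = 4): for a smax reduction with start value S
    // this branch splats <S, S, S, S>; feeding S into the extra lanes is
    // harmless because max(S, S) == S. An add reduction is handled in the
    // else branch below with Identity = <0, 0, 0, 0> and
    // VectorStart = <S, 0, 0, 0>, so only lane 0 carries the incoming sum.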
3713 if (VF == 1) { 3714 VectorStart = Identity = ReductionStartValue; 3715 } else { 3716 VectorStart = Identity = 3717 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); 3718 } 3719 } else { 3720 // Handle other reduction kinds: 3721 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 3722 RK, VecTy->getScalarType()); 3723 if (VF == 1) { 3724 Identity = Iden; 3725 // This vector is the Identity vector where the first element is the 3726 // incoming scalar reduction. 3727 VectorStart = ReductionStartValue; 3728 } else { 3729 Identity = ConstantVector::getSplat(VF, Iden); 3730 3731 // This vector is the Identity vector where the first element is the 3732 // incoming scalar reduction. 3733 VectorStart = 3734 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); 3735 } 3736 } 3737 3738 // Wrap flags are in general invalid after vectorization, clear them. 3739 clearReductionWrapFlags(RdxDesc); 3740 3741 // Fix the vector-loop phi. 3742 3743 // Reductions do not have to start at zero. They can start with 3744 // any loop invariant values. 3745 BasicBlock *Latch = OrigLoop->getLoopLatch(); 3746 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 3747 3748 for (unsigned Part = 0; Part < UF; ++Part) { 3749 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 3750 Value *Val = getOrCreateVectorValue(LoopVal, Part); 3751 // Make sure to add the reduction start value only to the 3752 // first unroll part. 3753 Value *StartVal = (Part == 0) ? VectorStart : Identity; 3754 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 3755 cast<PHINode>(VecRdxPhi) 3756 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3757 } 3758 3759 // Before each round, move the insertion point right between 3760 // the PHIs and the values we are going to write. 3761 // This allows us to write both PHINodes and the extractelement 3762 // instructions. 3763 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3764 3765 setDebugLocFromInst(Builder, LoopExitInst); 3766 3767 // If tail is folded by masking, the vector value to leave the loop should be 3768 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3769 // instead of the former. 3770 if (Cost->foldTailByMasking()) { 3771 for (unsigned Part = 0; Part < UF; ++Part) { 3772 Value *VecLoopExitInst = 3773 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3774 Value *Sel = nullptr; 3775 for (User *U : VecLoopExitInst->users()) { 3776 if (isa<SelectInst>(U)) { 3777 assert(!Sel && "Reduction exit feeding two selects"); 3778 Sel = U; 3779 } else 3780 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3781 } 3782 assert(Sel && "Reduction exit feeds no select"); 3783 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 3784 } 3785 } 3786 3787 // If the vector reduction can be performed in a smaller type, we truncate 3788 // then extend the loop exit value to enable InstCombine to evaluate the 3789 // entire expression in the smaller type. 3790 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { 3791 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3792 Builder.SetInsertPoint( 3793 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 3794 VectorParts RdxParts(UF); 3795 for (unsigned Part = 0; Part < UF; ++Part) { 3796 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3797 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3798 Value *Extnd = RdxDesc.isSigned() ? 
Builder.CreateSExt(Trunc, VecTy) 3799 : Builder.CreateZExt(Trunc, VecTy); 3800 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 3801 UI != RdxParts[Part]->user_end();) 3802 if (*UI != Trunc) { 3803 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 3804 RdxParts[Part] = Extnd; 3805 } else { 3806 ++UI; 3807 } 3808 } 3809 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3810 for (unsigned Part = 0; Part < UF; ++Part) { 3811 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3812 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 3813 } 3814 } 3815 3816 // Reduce all of the unrolled parts into a single vector. 3817 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 3818 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3819 3820 // The middle block terminator has already been assigned a DebugLoc here (the 3821 // OrigLoop's single latch terminator). We want the whole middle block to 3822 // appear to execute on this line because: (a) it is all compiler generated, 3823 // (b) these instructions are always executed after evaluating the latch 3824 // conditional branch, and (c) other passes may add new predecessors which 3825 // terminate on this line. This is the easiest way to ensure we don't 3826 // accidentally cause an extra step back into the loop while debugging. 3827 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 3828 for (unsigned Part = 1; Part < UF; ++Part) { 3829 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3830 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3831 // Floating point operations had to be 'fast' to enable the reduction. 3832 ReducedPartRdx = addFastMathFlag( 3833 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 3834 ReducedPartRdx, "bin.rdx"), 3835 RdxDesc.getFastMathFlags()); 3836 else 3837 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 3838 RdxPart); 3839 } 3840 3841 if (VF > 1) { 3842 bool NoNaN = Legal->hasFunNoNaNAttr(); 3843 ReducedPartRdx = 3844 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 3845 // If the reduction can be performed in a smaller type, we need to extend 3846 // the reduction to the wider type before we branch to the original loop. 3847 if (Phi->getType() != RdxDesc.getRecurrenceType()) 3848 ReducedPartRdx = 3849 RdxDesc.isSigned() 3850 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 3851 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 3852 } 3853 3854 // Create a phi node that merges control-flow from the backedge-taken check 3855 // block and the middle block. 3856 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 3857 LoopScalarPreHeader->getTerminator()); 3858 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3859 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3860 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3861 3862 // Now, we need to fix the users of the reduction variable 3863 // inside and outside of the scalar remainder loop. 3864 // We know that the loop is in LCSSA form. We need to update the 3865 // PHI nodes in the exit blocks. 3866 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3867 // All PHINodes need to have a single entry edge, or two if 3868 // we already fixed them. 3869 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 3870 3871 // We found a reduction value exit-PHI. Update it with the 3872 // incoming bypass edge. 
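    // Illustration only (names hypothetical): an exit phi such as
    //   %sum.lcssa = phi i32 [ %sum.next, %for.body ]
    // receives a second incoming value here,
    //   %sum.lcssa = phi i32 [ %sum.next, %for.body ],
    //                        [ %final.rdx, %middle.block ]
    // so users after the loop see the vector-computed result when the
    // scalar remainder is skipped.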
3873 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 3874 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 3875 } // end of the LCSSA phi scan. 3876 3877 // Fix the scalar loop reduction variable with the incoming reduction sum 3878 // from the vector body and from the backedge value. 3879 int IncomingEdgeBlockIdx = 3880 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 3881 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 3882 // Pick the other block. 3883 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 3884 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3885 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3886 } 3887 3888 void InnerLoopVectorizer::clearReductionWrapFlags( 3889 RecurrenceDescriptor &RdxDesc) { 3890 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3891 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 3892 RK != RecurrenceDescriptor::RK_IntegerMult) 3893 return; 3894 3895 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 3896 assert(LoopExitInstr && "null loop exit instruction"); 3897 SmallVector<Instruction *, 8> Worklist; 3898 SmallPtrSet<Instruction *, 8> Visited; 3899 Worklist.push_back(LoopExitInstr); 3900 Visited.insert(LoopExitInstr); 3901 3902 while (!Worklist.empty()) { 3903 Instruction *Cur = Worklist.pop_back_val(); 3904 if (isa<OverflowingBinaryOperator>(Cur)) 3905 for (unsigned Part = 0; Part < UF; ++Part) { 3906 Value *V = getOrCreateVectorValue(Cur, Part); 3907 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 3908 } 3909 3910 for (User *U : Cur->users()) { 3911 Instruction *UI = cast<Instruction>(U); 3912 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 3913 Visited.insert(UI).second) 3914 Worklist.push_back(UI); 3915 } 3916 } 3917 } 3918 3919 void InnerLoopVectorizer::fixLCSSAPHIs() { 3920 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3921 if (LCSSAPhi.getNumIncomingValues() == 1) { 3922 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 3923 // Non-instruction incoming values will have only one value. 3924 unsigned LastLane = 0; 3925 if (isa<Instruction>(IncomingValue)) 3926 LastLane = Cost->isUniformAfterVectorization( 3927 cast<Instruction>(IncomingValue), VF) 3928 ? 0 3929 : VF - 1; 3930 // Can be a loop invariant incoming value or the last scalar value to be 3931 // extracted from the vectorized loop. 3932 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3933 Value *lastIncomingValue = 3934 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 3935 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 3936 } 3937 } 3938 } 3939 3940 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 3941 // The basic block and loop containing the predicated instruction. 3942 auto *PredBB = PredInst->getParent(); 3943 auto *VectorLoop = LI->getLoopFor(PredBB); 3944 3945 // Initialize a worklist with the operands of the predicated instruction. 3946 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 3947 3948 // Holds instructions that we need to analyze again. An instruction may be 3949 // reanalyzed if we don't yet know if we can sink it or not. 3950 SmallVector<Instruction *, 8> InstsToReanalyze; 3951 3952 // Returns true if a given use occurs in the predicated block. Phi nodes use 3953 // their operands in their corresponding predecessor blocks. 
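  // Illustration only (names hypothetical): given
  //   pred.if:                          ; PredBB
  //     %v = add i32 %a, %b
  //     br label %pred.continue
  //   pred.continue:
  //     %r = phi i32 [ %v, %pred.if ], [ undef, %vector.body ]
  // the use of %v in %r is attributed to %pred.if, so it still counts as a
  // use inside the predicated block for the sinking test below.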
3954 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 3955 auto *I = cast<Instruction>(U.getUser()); 3956 BasicBlock *BB = I->getParent(); 3957 if (auto *Phi = dyn_cast<PHINode>(I)) 3958 BB = Phi->getIncomingBlock( 3959 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 3960 return BB == PredBB; 3961 }; 3962 3963 // Iteratively sink the scalarized operands of the predicated instruction 3964 // into the block we created for it. When an instruction is sunk, it's 3965 // operands are then added to the worklist. The algorithm ends after one pass 3966 // through the worklist doesn't sink a single instruction. 3967 bool Changed; 3968 do { 3969 // Add the instructions that need to be reanalyzed to the worklist, and 3970 // reset the changed indicator. 3971 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 3972 InstsToReanalyze.clear(); 3973 Changed = false; 3974 3975 while (!Worklist.empty()) { 3976 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 3977 3978 // We can't sink an instruction if it is a phi node, is already in the 3979 // predicated block, is not in the loop, or may have side effects. 3980 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 3981 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 3982 continue; 3983 3984 // It's legal to sink the instruction if all its uses occur in the 3985 // predicated block. Otherwise, there's nothing to do yet, and we may 3986 // need to reanalyze the instruction. 3987 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 3988 InstsToReanalyze.push_back(I); 3989 continue; 3990 } 3991 3992 // Move the instruction to the beginning of the predicated block, and add 3993 // it's operands to the worklist. 3994 I->moveBefore(&*PredBB->getFirstInsertionPt()); 3995 Worklist.insert(I->op_begin(), I->op_end()); 3996 3997 // The sinking may have enabled other instructions to be sunk, so we will 3998 // need to iterate. 3999 Changed = true; 4000 } 4001 } while (Changed); 4002 } 4003 4004 void InnerLoopVectorizer::fixNonInductionPHIs() { 4005 for (PHINode *OrigPhi : OrigPHIsToFix) { 4006 PHINode *NewPhi = 4007 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4008 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4009 4010 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4011 predecessors(OrigPhi->getParent())); 4012 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4013 predecessors(NewPhi->getParent())); 4014 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4015 "Scalar and Vector BB should have the same number of predecessors"); 4016 4017 // The insertion point in Builder may be invalidated by the time we get 4018 // here. Force the Builder insertion point to something valid so that we do 4019 // not run into issues during insertion point restore in 4020 // getOrCreateVectorValue calls below. 4021 Builder.SetInsertPoint(NewPhi); 4022 4023 // The predecessor order is preserved and we can rely on mapping between 4024 // scalar and vector block predecessors. 4025 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4026 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4027 4028 // When looking up the new scalar/vector values to fix up, use incoming 4029 // values from original phi. 
4030 Value *ScIncV = 4031 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4032 4033 // Scalar incoming value may need a broadcast 4034 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4035 NewPhi->addIncoming(NewIncV, NewPredBB); 4036 } 4037 } 4038 } 4039 4040 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, 4041 unsigned VF, bool IsPtrLoopInvariant, 4042 SmallBitVector &IsIndexLoopInvariant) { 4043 // Construct a vector GEP by widening the operands of the scalar GEP as 4044 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4045 // results in a vector of pointers when at least one operand of the GEP 4046 // is vector-typed. Thus, to keep the representation compact, we only use 4047 // vector-typed operands for loop-varying values. 4048 4049 if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4050 // If we are vectorizing, but the GEP has only loop-invariant operands, 4051 // the GEP we build (by only using vector-typed operands for 4052 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4053 // produce a vector of pointers, we need to either arbitrarily pick an 4054 // operand to broadcast, or broadcast a clone of the original GEP. 4055 // Here, we broadcast a clone of the original. 4056 // 4057 // TODO: If at some point we decide to scalarize instructions having 4058 // loop-invariant operands, this special case will no longer be 4059 // required. We would add the scalarization decision to 4060 // collectLoopScalars() and teach getVectorValue() to broadcast 4061 // the lane-zero scalar value. 4062 auto *Clone = Builder.Insert(GEP->clone()); 4063 for (unsigned Part = 0; Part < UF; ++Part) { 4064 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4065 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4066 addMetadata(EntryPart, GEP); 4067 } 4068 } else { 4069 // If the GEP has at least one loop-varying operand, we are sure to 4070 // produce a vector of pointers. But if we are only unrolling, we want 4071 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4072 // produce with the code below will be scalar (if VF == 1) or vector 4073 // (otherwise). Note that for the unroll-only case, we still maintain 4074 // values in the vector mapping with initVector, as we do for other 4075 // instructions. 4076 for (unsigned Part = 0; Part < UF; ++Part) { 4077 // The pointer operand of the new GEP. If it's loop-invariant, we 4078 // won't broadcast it. 4079 auto *Ptr = IsPtrLoopInvariant 4080 ? GEP->getPointerOperand() 4081 : getOrCreateVectorValue(GEP->getPointerOperand(), Part); 4082 4083 // Collect all the indices for the new GEP. If any index is 4084 // loop-invariant, we won't broadcast it. 4085 SmallVector<Value *, 4> Indices; 4086 for (auto Index : enumerate(GEP->indices())) { 4087 Value *User = Index.value().get(); 4088 if (IsIndexLoopInvariant[Index.index()]) 4089 Indices.push_back(User); 4090 else 4091 Indices.push_back(getOrCreateVectorValue(User, Part)); 4092 } 4093 4094 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4095 // but it should be a vector, otherwise. 4096 auto *NewGEP = 4097 GEP->isInBounds() 4098 ? 
Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4099 Indices) 4100 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4101 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4102 "NewGEP is not a pointer vector"); 4103 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4104 addMetadata(NewGEP, GEP); 4105 } 4106 } 4107 } 4108 4109 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4110 unsigned VF) { 4111 PHINode *P = cast<PHINode>(PN); 4112 if (EnableVPlanNativePath) { 4113 // Currently we enter here in the VPlan-native path for non-induction 4114 // PHIs where all control flow is uniform. We simply widen these PHIs. 4115 // Create a vector phi with no operands - the vector phi operands will be 4116 // set at the end of vector code generation. 4117 Type *VecTy = 4118 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4119 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4120 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4121 OrigPHIsToFix.push_back(P); 4122 4123 return; 4124 } 4125 4126 assert(PN->getParent() == OrigLoop->getHeader() && 4127 "Non-header phis should have been handled elsewhere"); 4128 4129 // In order to support recurrences we need to be able to vectorize Phi nodes. 4130 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4131 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4132 // this value when we vectorize all of the instructions that use the PHI. 4133 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4134 for (unsigned Part = 0; Part < UF; ++Part) { 4135 // This is phase one of vectorizing PHIs. 4136 Type *VecTy = 4137 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4138 Value *EntryPart = PHINode::Create( 4139 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4140 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4141 } 4142 return; 4143 } 4144 4145 setDebugLocFromInst(Builder, P); 4146 4147 // This PHINode must be an induction variable. 4148 // Make sure that we know about it. 4149 assert(Legal->getInductionVars()->count(P) && "Not an induction variable"); 4150 4151 InductionDescriptor II = Legal->getInductionVars()->lookup(P); 4152 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4153 4154 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4155 // which can be found from the original scalar operations. 4156 switch (II.getKind()) { 4157 case InductionDescriptor::IK_NoInduction: 4158 llvm_unreachable("Unknown induction"); 4159 case InductionDescriptor::IK_IntInduction: 4160 case InductionDescriptor::IK_FpInduction: 4161 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4162 case InductionDescriptor::IK_PtrInduction: { 4163 // Handle the pointer induction variable case. 4164 assert(P->getType()->isPointerTy() && "Unexpected type."); 4165 // This is the normalized GEP that starts counting at zero. 4166 Value *PtrInd = Induction; 4167 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); 4168 // Determine the number of scalars we need to generate for each unroll 4169 // iteration. If the instruction is uniform, we only need to generate the 4170 // first lane. Otherwise, we generate all VF values. 4171 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; 4172 // These are the scalar results. 
Notice that we don't generate vector GEPs 4173 // because scalar GEPs result in better code. 4174 for (unsigned Part = 0; Part < UF; ++Part) { 4175 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4176 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); 4177 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4178 Value *SclrGep = 4179 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4180 SclrGep->setName("next.gep"); 4181 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4182 } 4183 } 4184 return; 4185 } 4186 } 4187 } 4188 4189 /// A helper function for checking whether an integer division-related 4190 /// instruction may divide by zero (in which case it must be predicated if 4191 /// executed conditionally in the scalar code). 4192 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4193 /// Non-zero divisors that are non compile-time constants will not be 4194 /// converted into multiplication, so we will still end up scalarizing 4195 /// the division, but can do so w/o predication. 4196 static bool mayDivideByZero(Instruction &I) { 4197 assert((I.getOpcode() == Instruction::UDiv || 4198 I.getOpcode() == Instruction::SDiv || 4199 I.getOpcode() == Instruction::URem || 4200 I.getOpcode() == Instruction::SRem) && 4201 "Unexpected instruction"); 4202 Value *Divisor = I.getOperand(1); 4203 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4204 return !CInt || CInt->isZero(); 4205 } 4206 4207 void InnerLoopVectorizer::widenInstruction(Instruction &I) { 4208 switch (I.getOpcode()) { 4209 case Instruction::Br: 4210 case Instruction::PHI: 4211 case Instruction::GetElementPtr: 4212 llvm_unreachable("This instruction is handled by a different recipe."); 4213 case Instruction::UDiv: 4214 case Instruction::SDiv: 4215 case Instruction::SRem: 4216 case Instruction::URem: 4217 case Instruction::Add: 4218 case Instruction::FAdd: 4219 case Instruction::Sub: 4220 case Instruction::FSub: 4221 case Instruction::FNeg: 4222 case Instruction::Mul: 4223 case Instruction::FMul: 4224 case Instruction::FDiv: 4225 case Instruction::FRem: 4226 case Instruction::Shl: 4227 case Instruction::LShr: 4228 case Instruction::AShr: 4229 case Instruction::And: 4230 case Instruction::Or: 4231 case Instruction::Xor: { 4232 // Just widen unops and binops. 4233 setDebugLocFromInst(Builder, &I); 4234 4235 for (unsigned Part = 0; Part < UF; ++Part) { 4236 SmallVector<Value *, 2> Ops; 4237 for (Value *Op : I.operands()) 4238 Ops.push_back(getOrCreateVectorValue(Op, Part)); 4239 4240 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4241 4242 if (auto *VecOp = dyn_cast<Instruction>(V)) 4243 VecOp->copyIRFlags(&I); 4244 4245 // Use this vector value for all users of the original instruction. 4246 VectorLoopValueMap.setVectorValue(&I, Part, V); 4247 addMetadata(V, &I); 4248 } 4249 4250 break; 4251 } 4252 case Instruction::Select: { 4253 // Widen selects. 4254 // If the selector is loop invariant we can create a select 4255 // instruction with a scalar condition. Otherwise, use vector-select. 4256 auto *SE = PSE.getSE(); 4257 bool InvariantCond = 4258 SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); 4259 setDebugLocFromInst(Builder, &I); 4260 4261 // The condition can be loop invariant but still defined inside the 4262 // loop. This means that we can't just use the original 'cond' value. 4263 // We have to take the 'vectorized' value and pick the first lane. 4264 // Instcombine will make this a no-op. 
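    // Illustrative example (hypothetical IR): for a condition %c that is
    // loop-invariant but defined inside the loop, lane zero of its widened
    // value is used as a scalar condition, e.g. at VF = 4:
    //   %sel = select i1 %c.lane0, <4 x i32> %op0.vec, <4 x i32> %op1.vec
    // while a loop-varying condition yields a vector select:
    //   %sel = select <4 x i1> %c.vec, <4 x i32> %op0.vec, <4 x i32> %op1.vec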
4265 4266 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); 4267 4268 for (unsigned Part = 0; Part < UF; ++Part) { 4269 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); 4270 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); 4271 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); 4272 Value *Sel = 4273 Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1); 4274 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4275 addMetadata(Sel, &I); 4276 } 4277 4278 break; 4279 } 4280 4281 case Instruction::ICmp: 4282 case Instruction::FCmp: { 4283 // Widen compares. Generate vector compares. 4284 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4285 auto *Cmp = cast<CmpInst>(&I); 4286 setDebugLocFromInst(Builder, Cmp); 4287 for (unsigned Part = 0; Part < UF; ++Part) { 4288 Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); 4289 Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part); 4290 Value *C = nullptr; 4291 if (FCmp) { 4292 // Propagate fast math flags. 4293 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4294 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4295 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4296 } else { 4297 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4298 } 4299 VectorLoopValueMap.setVectorValue(&I, Part, C); 4300 addMetadata(C, &I); 4301 } 4302 4303 break; 4304 } 4305 4306 case Instruction::ZExt: 4307 case Instruction::SExt: 4308 case Instruction::FPToUI: 4309 case Instruction::FPToSI: 4310 case Instruction::FPExt: 4311 case Instruction::PtrToInt: 4312 case Instruction::IntToPtr: 4313 case Instruction::SIToFP: 4314 case Instruction::UIToFP: 4315 case Instruction::Trunc: 4316 case Instruction::FPTrunc: 4317 case Instruction::BitCast: { 4318 auto *CI = cast<CastInst>(&I); 4319 setDebugLocFromInst(Builder, CI); 4320 4321 /// Vectorize casts. 4322 Type *DestTy = 4323 (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); 4324 4325 for (unsigned Part = 0; Part < UF; ++Part) { 4326 Value *A = getOrCreateVectorValue(CI->getOperand(0), Part); 4327 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4328 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4329 addMetadata(Cast, &I); 4330 } 4331 break; 4332 } 4333 4334 case Instruction::Call: { 4335 // Ignore dbg intrinsics. 4336 if (isa<DbgInfoIntrinsic>(I)) 4337 break; 4338 setDebugLocFromInst(Builder, &I); 4339 4340 Module *M = I.getParent()->getParent()->getParent(); 4341 auto *CI = cast<CallInst>(&I); 4342 4343 SmallVector<Type *, 4> Tys; 4344 for (Value *ArgOperand : CI->arg_operands()) 4345 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4346 4347 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4348 4349 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4350 // version of the instruction. 4351 // Is it beneficial to perform intrinsic call compared to lib call? 4352 bool NeedToScalarize; 4353 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4354 bool UseVectorIntrinsic = 4355 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4356 assert((UseVectorIntrinsic || !NeedToScalarize) && 4357 "Instruction should be scalarized elsewhere."); 4358 4359 for (unsigned Part = 0; Part < UF; ++Part) { 4360 SmallVector<Value *, 4> Args; 4361 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 4362 Value *Arg = CI->getArgOperand(i); 4363 // Some intrinsics have a scalar argument - don't replace it with a 4364 // vector. 
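        // Illustrative example (assuming @llvm.powi is the chosen vector
        // intrinsic; its operand 1 is one such scalar argument):
        //   %r = call float @llvm.powi.f32(float %x, i32 %n)
        // widens at VF = 4 to
        //   %r.v = call <4 x float> @llvm.powi.v4f32(<4 x float> %x.v, i32 %n)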
4365 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) 4366 Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part); 4367 Args.push_back(Arg); 4368 } 4369 4370 Function *VectorF; 4371 if (UseVectorIntrinsic) { 4372 // Use vector version of the intrinsic. 4373 Type *TysForDecl[] = {CI->getType()}; 4374 if (VF > 1) 4375 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4376 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4377 } else { 4378 // Use vector version of the function call. 4379 const VFShape Shape = 4380 VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/); 4381 #ifndef NDEBUG 4382 const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI); 4383 assert(std::find_if(Infos.begin(), Infos.end(), 4384 [&Shape](const VFInfo &Info) { 4385 return Info.Shape == Shape; 4386 }) != Infos.end() && 4387 "Vector function shape is missing from the database."); 4388 #endif 4389 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4390 } 4391 assert(VectorF && "Can't create vector function."); 4392 4393 SmallVector<OperandBundleDef, 1> OpBundles; 4394 CI->getOperandBundlesAsDefs(OpBundles); 4395 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4396 4397 if (isa<FPMathOperator>(V)) 4398 V->copyFastMathFlags(CI); 4399 4400 VectorLoopValueMap.setVectorValue(&I, Part, V); 4401 addMetadata(V, &I); 4402 } 4403 4404 break; 4405 } 4406 4407 default: 4408 // This instruction is not vectorized by simple widening. 4409 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4410 llvm_unreachable("Unhandled instruction!"); 4411 } // end of switch. 4412 } 4413 4414 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4415 // We should not collect Scalars more than once per VF. Right now, this 4416 // function is called from collectUniformsAndScalars(), which already does 4417 // this check. Collecting Scalars for VF=1 does not make any sense. 4418 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && 4419 "This function should not be visited twice for the same VF"); 4420 4421 SmallSetVector<Instruction *, 8> Worklist; 4422 4423 // These sets are used to seed the analysis with pointers used by memory 4424 // accesses that will remain scalar. 4425 SmallSetVector<Instruction *, 8> ScalarPtrs; 4426 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4427 4428 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4429 // The pointer operands of loads and stores will be scalar as long as the 4430 // memory access is not a gather or scatter operation. The value operand of a 4431 // store will remain scalar if the store is scalarized. 4432 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4433 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4434 assert(WideningDecision != CM_Unknown && 4435 "Widening decision should be ready at this moment"); 4436 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4437 if (Ptr == Store->getValueOperand()) 4438 return WideningDecision == CM_Scalarize; 4439 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4440 "Ptr is neither a value or pointer operand"); 4441 return WideningDecision != CM_GatherScatter; 4442 }; 4443 4444 // A helper that returns true if the given value is a bitcast or 4445 // getelementptr instruction contained in the loop. 
4446 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4447 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4448 isa<GetElementPtrInst>(V)) && 4449 !TheLoop->isLoopInvariant(V); 4450 }; 4451 4452 // A helper that evaluates a memory access's use of a pointer. If the use 4453 // will be a scalar use, and the pointer is only used by memory accesses, we 4454 // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4455 // PossibleNonScalarPtrs. 4456 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4457 // We only care about bitcast and getelementptr instructions contained in 4458 // the loop. 4459 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4460 return; 4461 4462 // If the pointer has already been identified as scalar (e.g., if it was 4463 // also identified as uniform), there's nothing to do. 4464 auto *I = cast<Instruction>(Ptr); 4465 if (Worklist.count(I)) 4466 return; 4467 4468 // If the use of the pointer will be a scalar use, and all users of the 4469 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4470 // place the pointer in PossibleNonScalarPtrs. 4471 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4472 return isa<LoadInst>(U) || isa<StoreInst>(U); 4473 })) 4474 ScalarPtrs.insert(I); 4475 else 4476 PossibleNonScalarPtrs.insert(I); 4477 }; 4478 4479 // We seed the scalars analysis with three classes of instructions: (1) 4480 // instructions marked uniform-after-vectorization, (2) bitcast and 4481 // getelementptr instructions used by memory accesses requiring a scalar use, 4482 // and (3) pointer induction variables and their update instructions (we 4483 // currently only scalarize these). 4484 // 4485 // (1) Add to the worklist all instructions that have been identified as 4486 // uniform-after-vectorization. 4487 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4488 4489 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4490 // memory accesses requiring a scalar use. The pointer operands of loads and 4491 // stores will be scalar as long as the memory accesses is not a gather or 4492 // scatter operation. The value operand of a store will remain scalar if the 4493 // store is scalarized. 4494 for (auto *BB : TheLoop->blocks()) 4495 for (auto &I : *BB) { 4496 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4497 evaluatePtrUse(Load, Load->getPointerOperand()); 4498 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4499 evaluatePtrUse(Store, Store->getPointerOperand()); 4500 evaluatePtrUse(Store, Store->getValueOperand()); 4501 } 4502 } 4503 for (auto *I : ScalarPtrs) 4504 if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) { 4505 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4506 Worklist.insert(I); 4507 } 4508 4509 // (3) Add to the worklist all pointer induction variables and their update 4510 // instructions. 4511 // 4512 // TODO: Once we are able to vectorize pointer induction variables we should 4513 // no longer insert them into the worklist here. 
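  // Illustrative example (hypothetical IR): for a pointer induction
  //   %p = phi i32* [ %base, %preheader ], [ %p.next, %latch ]
  //   %p.next = getelementptr inbounds i32, i32* %p, i64 1
  // both %p and %p.next are seeded into the worklist as scalars here.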
4514 auto *Latch = TheLoop->getLoopLatch(); 4515 for (auto &Induction : *Legal->getInductionVars()) { 4516 auto *Ind = Induction.first; 4517 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4518 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) 4519 continue; 4520 Worklist.insert(Ind); 4521 Worklist.insert(IndUpdate); 4522 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4523 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4524 << "\n"); 4525 } 4526 4527 // Insert the forced scalars. 4528 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4529 // induction variable when the PHI user is scalarized. 4530 auto ForcedScalar = ForcedScalars.find(VF); 4531 if (ForcedScalar != ForcedScalars.end()) 4532 for (auto *I : ForcedScalar->second) 4533 Worklist.insert(I); 4534 4535 // Expand the worklist by looking through any bitcasts and getelementptr 4536 // instructions we've already identified as scalar. This is similar to the 4537 // expansion step in collectLoopUniforms(); however, here we're only 4538 // expanding to include additional bitcasts and getelementptr instructions. 4539 unsigned Idx = 0; 4540 while (Idx != Worklist.size()) { 4541 Instruction *Dst = Worklist[Idx++]; 4542 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4543 continue; 4544 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4545 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4546 auto *J = cast<Instruction>(U); 4547 return !TheLoop->contains(J) || Worklist.count(J) || 4548 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4549 isScalarUse(J, Src)); 4550 })) { 4551 Worklist.insert(Src); 4552 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4553 } 4554 } 4555 4556 // An induction variable will remain scalar if all users of the induction 4557 // variable and induction variable update remain scalar. 4558 for (auto &Induction : *Legal->getInductionVars()) { 4559 auto *Ind = Induction.first; 4560 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4561 4562 // We already considered pointer induction variables, so there's no reason 4563 // to look at their users again. 4564 // 4565 // TODO: Once we are able to vectorize pointer induction variables we 4566 // should no longer skip over them here. 4567 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) 4568 continue; 4569 4570 // Determine if all users of the induction variable are scalar after 4571 // vectorization. 4572 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4573 auto *I = cast<Instruction>(U); 4574 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4575 }); 4576 if (!ScalarInd) 4577 continue; 4578 4579 // Determine if all users of the induction variable update instruction are 4580 // scalar after vectorization. 4581 auto ScalarIndUpdate = 4582 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4583 auto *I = cast<Instruction>(U); 4584 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4585 }); 4586 if (!ScalarIndUpdate) 4587 continue; 4588 4589 // The induction variable and its update instruction will remain scalar. 
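    // For example (illustrative): an integer IV whose only in-loop users are
    // its own update, already-scalar address computations, and the latch
    // compare passes both checks above and is inserted below; an IV that
    // also feeds a widened arithmetic instruction is not.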
4590 Worklist.insert(Ind); 4591 Worklist.insert(IndUpdate); 4592 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4593 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4594 << "\n"); 4595 } 4596 4597 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4598 } 4599 4600 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { 4601 if (!blockNeedsPredication(I->getParent())) 4602 return false; 4603 switch(I->getOpcode()) { 4604 default: 4605 break; 4606 case Instruction::Load: 4607 case Instruction::Store: { 4608 if (!Legal->isMaskRequired(I)) 4609 return false; 4610 auto *Ptr = getLoadStorePointerOperand(I); 4611 auto *Ty = getMemInstValueType(I); 4612 // We have already decided how to vectorize this instruction, get that 4613 // result. 4614 if (VF > 1) { 4615 InstWidening WideningDecision = getWideningDecision(I, VF); 4616 assert(WideningDecision != CM_Unknown && 4617 "Widening decision should be ready at this moment"); 4618 return WideningDecision == CM_Scalarize; 4619 } 4620 const MaybeAlign Alignment = getLoadStoreAlignment(I); 4621 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4622 isLegalMaskedGather(Ty, Alignment)) 4623 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4624 isLegalMaskedScatter(Ty, Alignment)); 4625 } 4626 case Instruction::UDiv: 4627 case Instruction::SDiv: 4628 case Instruction::SRem: 4629 case Instruction::URem: 4630 return mayDivideByZero(*I); 4631 } 4632 return false; 4633 } 4634 4635 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, 4636 unsigned VF) { 4637 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4638 assert(getWideningDecision(I, VF) == CM_Unknown && 4639 "Decision should not be set yet."); 4640 auto *Group = getInterleavedAccessGroup(I); 4641 assert(Group && "Must have a group."); 4642 4643 // If the instruction's allocated size doesn't equal it's type size, it 4644 // requires padding and will be scalarized. 4645 auto &DL = I->getModule()->getDataLayout(); 4646 auto *ScalarTy = getMemInstValueType(I); 4647 if (hasIrregularType(ScalarTy, DL, VF)) 4648 return false; 4649 4650 // Check if masking is required. 4651 // A Group may need masking for one of two reasons: it resides in a block that 4652 // needs predication, or it was decided to use masking to deal with gaps. 4653 bool PredicatedAccessRequiresMasking = 4654 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 4655 bool AccessWithGapsRequiresMasking = 4656 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 4657 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 4658 return true; 4659 4660 // If masked interleaving is required, we expect that the user/target had 4661 // enabled it, because otherwise it either wouldn't have been created or 4662 // it should have been invalidated by the CostModel. 4663 assert(useMaskedInterleavedAccesses(TTI) && 4664 "Masked interleave-groups for predicated accesses are not enabled."); 4665 4666 auto *Ty = getMemInstValueType(I); 4667 const MaybeAlign Alignment = getLoadStoreAlignment(I); 4668 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4669 : TTI.isLegalMaskedStore(Ty, Alignment); 4670 } 4671 4672 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, 4673 unsigned VF) { 4674 // Get and ensure we have a valid memory instruction. 
4675 LoadInst *LI = dyn_cast<LoadInst>(I); 4676 StoreInst *SI = dyn_cast<StoreInst>(I); 4677 assert((LI || SI) && "Invalid memory instruction"); 4678 4679 auto *Ptr = getLoadStorePointerOperand(I); 4680 4681 // In order to be widened, the pointer should be consecutive, first of all. 4682 if (!Legal->isConsecutivePtr(Ptr)) 4683 return false; 4684 4685 // If the instruction is a store located in a predicated block, it will be 4686 // scalarized. 4687 if (isScalarWithPredication(I)) 4688 return false; 4689 4690 // If the instruction's allocated size doesn't equal it's type size, it 4691 // requires padding and will be scalarized. 4692 auto &DL = I->getModule()->getDataLayout(); 4693 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 4694 if (hasIrregularType(ScalarTy, DL, VF)) 4695 return false; 4696 4697 return true; 4698 } 4699 4700 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { 4701 // We should not collect Uniforms more than once per VF. Right now, 4702 // this function is called from collectUniformsAndScalars(), which 4703 // already does this check. Collecting Uniforms for VF=1 does not make any 4704 // sense. 4705 4706 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && 4707 "This function should not be visited twice for the same VF"); 4708 4709 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4710 // not analyze again. Uniforms.count(VF) will return 1. 4711 Uniforms[VF].clear(); 4712 4713 // We now know that the loop is vectorizable! 4714 // Collect instructions inside the loop that will remain uniform after 4715 // vectorization. 4716 4717 // Global values, params and instructions outside of current loop are out of 4718 // scope. 4719 auto isOutOfScope = [&](Value *V) -> bool { 4720 Instruction *I = dyn_cast<Instruction>(V); 4721 return (!I || !TheLoop->contains(I)); 4722 }; 4723 4724 SetVector<Instruction *> Worklist; 4725 BasicBlock *Latch = TheLoop->getLoopLatch(); 4726 4727 // Instructions that are scalar with predication must not be considered 4728 // uniform after vectorization, because that would create an erroneous 4729 // replicating region where only a single instance out of VF should be formed. 4730 // TODO: optimize such seldom cases if found important, see PR40816. 4731 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4732 if (isScalarWithPredication(I, VF)) { 4733 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4734 << *I << "\n"); 4735 return; 4736 } 4737 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4738 Worklist.insert(I); 4739 }; 4740 4741 // Start with the conditional branch. If the branch condition is an 4742 // instruction contained in the loop that is only used by the branch, it is 4743 // uniform. 4744 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4745 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4746 addToWorklistIfAllowed(Cmp); 4747 4748 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 4749 // are pointers that are treated like consecutive pointers during 4750 // vectorization. The pointer operands of interleaved accesses are an 4751 // example. 4752 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 4753 4754 // Holds pointer operands of instructions that are possibly non-uniform. 
4755 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 4756 4757 auto isUniformDecision = [&](Instruction *I, unsigned VF) { 4758 InstWidening WideningDecision = getWideningDecision(I, VF); 4759 assert(WideningDecision != CM_Unknown && 4760 "Widening decision should be ready at this moment"); 4761 4762 return (WideningDecision == CM_Widen || 4763 WideningDecision == CM_Widen_Reverse || 4764 WideningDecision == CM_Interleave); 4765 }; 4766 // Iterate over the instructions in the loop, and collect all 4767 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 4768 // that a consecutive-like pointer operand will be scalarized, we collect it 4769 // in PossibleNonUniformPtrs instead. We use two sets here because a single 4770 // getelementptr instruction can be used by both vectorized and scalarized 4771 // memory instructions. For example, if a loop loads and stores from the same 4772 // location, but the store is conditional, the store will be scalarized, and 4773 // the getelementptr won't remain uniform. 4774 for (auto *BB : TheLoop->blocks()) 4775 for (auto &I : *BB) { 4776 // If there's no pointer operand, there's nothing to do. 4777 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 4778 if (!Ptr) 4779 continue; 4780 4781 // True if all users of Ptr are memory accesses that have Ptr as their 4782 // pointer operand. 4783 auto UsersAreMemAccesses = 4784 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 4785 return getLoadStorePointerOperand(U) == Ptr; 4786 }); 4787 4788 // Ensure the memory instruction will not be scalarized or used by 4789 // gather/scatter, making its pointer operand non-uniform. If the pointer 4790 // operand is used by any instruction other than a memory access, we 4791 // conservatively assume the pointer operand may be non-uniform. 4792 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 4793 PossibleNonUniformPtrs.insert(Ptr); 4794 4795 // If the memory instruction will be vectorized and its pointer operand 4796 // is consecutive-like, or interleaving - the pointer operand should 4797 // remain uniform. 4798 else 4799 ConsecutiveLikePtrs.insert(Ptr); 4800 } 4801 4802 // Add to the Worklist all consecutive and consecutive-like pointers that 4803 // aren't also identified as possibly non-uniform. 4804 for (auto *V : ConsecutiveLikePtrs) 4805 if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) 4806 addToWorklistIfAllowed(V); 4807 4808 // Expand Worklist in topological order: whenever a new instruction 4809 // is added , its users should be already inside Worklist. It ensures 4810 // a uniform instruction will only be used by uniform instructions. 4811 unsigned idx = 0; 4812 while (idx != Worklist.size()) { 4813 Instruction *I = Worklist[idx++]; 4814 4815 for (auto OV : I->operand_values()) { 4816 // isOutOfScope operands cannot be uniform instructions. 4817 if (isOutOfScope(OV)) 4818 continue; 4819 // First order recurrence Phi's should typically be considered 4820 // non-uniform. 4821 auto *OP = dyn_cast<PHINode>(OV); 4822 if (OP && Legal->isFirstOrderRecurrence(OP)) 4823 continue; 4824 // If all the users of the operand are uniform, then add the 4825 // operand into the uniform worklist. 
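      // Illustrative example: a getelementptr whose users are all either
      // already in Worklist or consecutive widened accesses using it as
      // their pointer operand is promoted to uniform here; one that also
      // feeds a scalarized or gather/scatter access is not.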
4826 auto *OI = cast<Instruction>(OV); 4827 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4828 auto *J = cast<Instruction>(U); 4829 return Worklist.count(J) || 4830 (OI == getLoadStorePointerOperand(J) && 4831 isUniformDecision(J, VF)); 4832 })) 4833 addToWorklistIfAllowed(OI); 4834 } 4835 } 4836 4837 // Returns true if Ptr is the pointer operand of a memory access instruction 4838 // I, and I is known to not require scalarization. 4839 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4840 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4841 }; 4842 4843 // For an instruction to be added into Worklist above, all its users inside 4844 // the loop should also be in Worklist. However, this condition cannot be 4845 // true for phi nodes that form a cyclic dependence. We must process phi 4846 // nodes separately. An induction variable will remain uniform if all users 4847 // of the induction variable and induction variable update remain uniform. 4848 // The code below handles both pointer and non-pointer induction variables. 4849 for (auto &Induction : *Legal->getInductionVars()) { 4850 auto *Ind = Induction.first; 4851 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4852 4853 // Determine if all users of the induction variable are uniform after 4854 // vectorization. 4855 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4856 auto *I = cast<Instruction>(U); 4857 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4858 isVectorizedMemAccessUse(I, Ind); 4859 }); 4860 if (!UniformInd) 4861 continue; 4862 4863 // Determine if all users of the induction variable update instruction are 4864 // uniform after vectorization. 4865 auto UniformIndUpdate = 4866 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4867 auto *I = cast<Instruction>(U); 4868 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4869 isVectorizedMemAccessUse(I, IndUpdate); 4870 }); 4871 if (!UniformIndUpdate) 4872 continue; 4873 4874 // The induction variable and its update instruction will remain uniform. 4875 addToWorklistIfAllowed(Ind); 4876 addToWorklistIfAllowed(IndUpdate); 4877 } 4878 4879 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4880 } 4881 4882 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4883 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4884 4885 if (Legal->getRuntimePointerChecking()->Need) { 4886 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4887 "runtime pointer checks needed. Enable vectorization of this " 4888 "loop with '#pragma clang loop vectorize(enable)' when " 4889 "compiling with -Os/-Oz", 4890 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4891 return true; 4892 } 4893 4894 if (!PSE.getUnionPredicate().getPredicates().empty()) { 4895 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4896 "runtime SCEV checks needed. Enable vectorization of this " 4897 "loop with '#pragma clang loop vectorize(enable)' when " 4898 "compiling with -Os/-Oz", 4899 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4900 return true; 4901 } 4902 4903 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4904 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4905 reportVectorizationFailure("Runtime stride check is required with -Os/-Oz", 4906 "runtime stride == 1 checks needed. 
Enable vectorization of " 4907 "this loop with '#pragma clang loop vectorize(enable)' when " 4908 "compiling with -Os/-Oz", 4909 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4910 return true; 4911 } 4912 4913 return false; 4914 } 4915 4916 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { 4917 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4918 // TODO: It may by useful to do since it's still likely to be dynamically 4919 // uniform if the target can skip. 4920 reportVectorizationFailure( 4921 "Not inserting runtime ptr check for divergent target", 4922 "runtime pointer checks needed. Not enabled for divergent target", 4923 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 4924 return None; 4925 } 4926 4927 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4928 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4929 if (TC == 1) { 4930 reportVectorizationFailure("Single iteration (non) loop", 4931 "loop trip count is one, irrelevant for vectorization", 4932 "SingleIterationLoop", ORE, TheLoop); 4933 return None; 4934 } 4935 4936 switch (ScalarEpilogueStatus) { 4937 case CM_ScalarEpilogueAllowed: 4938 return computeFeasibleMaxVF(TC); 4939 case CM_ScalarEpilogueNotNeededUsePredicate: 4940 LLVM_DEBUG( 4941 dbgs() << "LV: vector predicate hint/switch found.\n" 4942 << "LV: Not allowing scalar epilogue, creating predicated " 4943 << "vector loop.\n"); 4944 break; 4945 case CM_ScalarEpilogueNotAllowedLowTripLoop: 4946 // fallthrough as a special case of OptForSize 4947 case CM_ScalarEpilogueNotAllowedOptSize: 4948 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 4949 LLVM_DEBUG( 4950 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 4951 else 4952 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4953 << "count.\n"); 4954 4955 // Bail if runtime checks are required, which are not good when optimising 4956 // for size. 4957 if (runtimeChecksRequired()) 4958 return None; 4959 break; 4960 } 4961 4962 // Now try the tail folding 4963 4964 // Invalidate interleave groups that require an epilogue if we can't mask 4965 // the interleave-group. 4966 if (!useMaskedInterleavedAccesses(TTI)) 4967 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 4968 4969 unsigned MaxVF = computeFeasibleMaxVF(TC); 4970 if (TC > 0 && TC % MaxVF == 0) { 4971 // Accept MaxVF if we do not have a tail. 4972 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 4973 return MaxVF; 4974 } 4975 4976 // If we don't know the precise trip count, or if the trip count that we 4977 // found modulo the vectorization factor is not zero, try to fold the tail 4978 // by masking. 4979 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 4980 if (Legal->prepareToFoldTailByMasking()) { 4981 FoldTailByMasking = true; 4982 return MaxVF; 4983 } 4984 4985 if (TC == 0) { 4986 reportVectorizationFailure( 4987 "Unable to calculate the loop count due to complex control flow", 4988 "unable to calculate the loop count due to complex control flow", 4989 "UnknownLoopCountComplexCFG", ORE, TheLoop); 4990 return None; 4991 } 4992 4993 reportVectorizationFailure( 4994 "Cannot optimize for size and vectorize at the same time.", 4995 "cannot optimize for size and vectorize at the same time. 
" 4996 "Enable vectorization of this loop with '#pragma clang loop " 4997 "vectorize(enable)' when compiling with -Os/-Oz", 4998 "NoTailLoopWithOptForSize", ORE, TheLoop); 4999 return None; 5000 } 5001 5002 unsigned 5003 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 5004 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5005 unsigned SmallestType, WidestType; 5006 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5007 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5008 5009 // Get the maximum safe dependence distance in bits computed by LAA. 5010 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5011 // the memory accesses that is most restrictive (involved in the smallest 5012 // dependence distance). 5013 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 5014 5015 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 5016 5017 unsigned MaxVectorSize = WidestRegister / WidestType; 5018 5019 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5020 << " / " << WidestType << " bits.\n"); 5021 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5022 << WidestRegister << " bits.\n"); 5023 5024 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 5025 " into one vector!"); 5026 if (MaxVectorSize == 0) { 5027 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5028 MaxVectorSize = 1; 5029 return MaxVectorSize; 5030 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5031 isPowerOf2_32(ConstTripCount)) { 5032 // We need to clamp the VF to be the ConstTripCount. There is no point in 5033 // choosing a higher viable VF as done in the loop below. 5034 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5035 << ConstTripCount << "\n"); 5036 MaxVectorSize = ConstTripCount; 5037 return MaxVectorSize; 5038 } 5039 5040 unsigned MaxVF = MaxVectorSize; 5041 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5042 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5043 // Collect all viable vectorization factors larger than the default MaxVF 5044 // (i.e. MaxVectorSize). 5045 SmallVector<unsigned, 8> VFs; 5046 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5047 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5048 VFs.push_back(VS); 5049 5050 // For each VF calculate its register usage. 5051 auto RUs = calculateRegisterUsage(VFs); 5052 5053 // Select the largest VF which doesn't require more registers than existing 5054 // ones. 
5055 for (int i = RUs.size() - 1; i >= 0; --i) { 5056 bool Selected = true; 5057 for (auto& pair : RUs[i].MaxLocalUsers) { 5058 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5059 if (pair.second > TargetNumRegisters) 5060 Selected = false; 5061 } 5062 if (Selected) { 5063 MaxVF = VFs[i]; 5064 break; 5065 } 5066 } 5067 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5068 if (MaxVF < MinVF) { 5069 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5070 << ") with target's minimum: " << MinVF << '\n'); 5071 MaxVF = MinVF; 5072 } 5073 } 5074 } 5075 return MaxVF; 5076 } 5077 5078 VectorizationFactor 5079 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5080 float Cost = expectedCost(1).first; 5081 const float ScalarCost = Cost; 5082 unsigned Width = 1; 5083 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5084 5085 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5086 if (ForceVectorization && MaxVF > 1) { 5087 // Ignore scalar width, because the user explicitly wants vectorization. 5088 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5089 // evaluation. 5090 Cost = std::numeric_limits<float>::max(); 5091 } 5092 5093 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5094 // Notice that the vector loop needs to be executed less times, so 5095 // we need to divide the cost of the vector loops by the width of 5096 // the vector elements. 5097 VectorizationCostTy C = expectedCost(i); 5098 float VectorCost = C.first / (float)i; 5099 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5100 << " costs: " << (int)VectorCost << ".\n"); 5101 if (!C.second && !ForceVectorization) { 5102 LLVM_DEBUG( 5103 dbgs() << "LV: Not considering vector loop of width " << i 5104 << " because it will not generate any vector instructions.\n"); 5105 continue; 5106 } 5107 if (VectorCost < Cost) { 5108 Cost = VectorCost; 5109 Width = i; 5110 } 5111 } 5112 5113 if (!EnableCondStoresVectorization && NumPredStores) { 5114 reportVectorizationFailure("There are conditional stores.", 5115 "store that is conditionally executed prevents vectorization", 5116 "ConditionalStore", ORE, TheLoop); 5117 Width = 1; 5118 Cost = ScalarCost; 5119 } 5120 5121 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5122 << "LV: Vectorization seems to be not beneficial, " 5123 << "but was forced by a user.\n"); 5124 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5125 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; 5126 return Factor; 5127 } 5128 5129 std::pair<unsigned, unsigned> 5130 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5131 unsigned MinWidth = -1U; 5132 unsigned MaxWidth = 8; 5133 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5134 5135 // For each block. 5136 for (BasicBlock *BB : TheLoop->blocks()) { 5137 // For each instruction in the loop. 5138 for (Instruction &I : BB->instructionsWithoutDebug()) { 5139 Type *T = I.getType(); 5140 5141 // Skip ignored values. 5142 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end()) 5143 continue; 5144 5145 // Only examine Loads, Stores and PHINodes. 5146 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5147 continue; 5148 5149 // Examine PHI nodes that are reduction variables. Update the type to 5150 // account for the recurrence type. 
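      // For example (illustrative): a reduction phi declared as i32 whose
      // recurrence has been narrowed to i8 contributes 8 bits, not 32, to
      // the smallest/widest type bounds computed below.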
5151 if (auto *PN = dyn_cast<PHINode>(&I)) { 5152 if (!Legal->isReductionVariable(PN)) 5153 continue; 5154 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN]; 5155 T = RdxDesc.getRecurrenceType(); 5156 } 5157 5158 // Examine the stored values. 5159 if (auto *ST = dyn_cast<StoreInst>(&I)) 5160 T = ST->getValueOperand()->getType(); 5161 5162 // Ignore loaded pointer types and stored pointer types that are not 5163 // vectorizable. 5164 // 5165 // FIXME: The check here attempts to predict whether a load or store will 5166 // be vectorized. We only know this for certain after a VF has 5167 // been selected. Here, we assume that if an access can be 5168 // vectorized, it will be. We should also look at extending this 5169 // optimization to non-pointer types. 5170 // 5171 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5172 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5173 continue; 5174 5175 MinWidth = std::min(MinWidth, 5176 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5177 MaxWidth = std::max(MaxWidth, 5178 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5179 } 5180 } 5181 5182 return {MinWidth, MaxWidth}; 5183 } 5184 5185 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, 5186 unsigned LoopCost) { 5187 // -- The interleave heuristics -- 5188 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5189 // There are many micro-architectural considerations that we can't predict 5190 // at this level. For example, frontend pressure (on decode or fetch) due to 5191 // code size, or the number and capabilities of the execution ports. 5192 // 5193 // We use the following heuristics to select the interleave count: 5194 // 1. If the code has reductions, then we interleave to break the cross 5195 // iteration dependency. 5196 // 2. If the loop is really small, then we interleave to reduce the loop 5197 // overhead. 5198 // 3. We don't interleave if we think that we will spill registers to memory 5199 // due to the increased register pressure. 5200 5201 if (!isScalarEpilogueAllowed()) 5202 return 1; 5203 5204 // We used the distance for the interleave count. 5205 if (Legal->getMaxSafeDepDistBytes() != -1U) 5206 return 1; 5207 5208 // Do not interleave loops with a relatively small known or estimated trip 5209 // count. 5210 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5211 if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) 5212 return 1; 5213 5214 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5215 // We divide by these constants so assume that we have at least one 5216 // instruction that uses at least one register. 5217 for (auto& pair : R.MaxLocalUsers) { 5218 pair.second = std::max(pair.second, 1U); 5219 } 5220 5221 // We calculate the interleave count using the following formula. 5222 // Subtract the number of loop invariants from the number of available 5223 // registers. These registers are used by all of the interleaved instances. 5224 // Next, divide the remaining registers by the number of registers that is 5225 // required by the loop, in order to estimate how many parallel instances 5226 // fit without causing spills. All of this is rounded down if necessary to be 5227 // a power of two. We want power of two interleave count to simplify any 5228 // addressing operations or alignment considerations. 
5229 // We also want power of two interleave counts to ensure that the induction 5230 // variable of the vector loop wraps to zero, when tail is folded by masking; 5231 // this currently happens when OptForSize, in which case IC is set to 1 above. 5232 unsigned IC = UINT_MAX; 5233 5234 for (auto& pair : R.MaxLocalUsers) { 5235 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5236 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5237 << " registers of " 5238 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5239 if (VF == 1) { 5240 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5241 TargetNumRegisters = ForceTargetNumScalarRegs; 5242 } else { 5243 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5244 TargetNumRegisters = ForceTargetNumVectorRegs; 5245 } 5246 unsigned MaxLocalUsers = pair.second; 5247 unsigned LoopInvariantRegs = 0; 5248 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5249 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5250 5251 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5252 // Don't count the induction variable as interleaved. 5253 if (EnableIndVarRegisterHeur) { 5254 TmpIC = 5255 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5256 std::max(1U, (MaxLocalUsers - 1))); 5257 } 5258 5259 IC = std::min(IC, TmpIC); 5260 } 5261 5262 // Clamp the interleave ranges to reasonable counts. 5263 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5264 5265 // Check if the user has overridden the max. 5266 if (VF == 1) { 5267 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5268 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5269 } else { 5270 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5271 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5272 } 5273 5274 // If trip count is known or estimated compile time constant, limit the 5275 // interleave count to be less than the trip count divided by VF. 5276 if (BestKnownTC) { 5277 MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); 5278 } 5279 5280 // If we did not calculate the cost for VF (because the user selected the VF) 5281 // then we calculate the cost of VF here. 5282 if (LoopCost == 0) 5283 LoopCost = expectedCost(VF).first; 5284 5285 assert(LoopCost && "Non-zero loop cost expected"); 5286 5287 // Clamp the calculated IC to be between the 1 and the max interleave count 5288 // that the target and trip count allows. 5289 if (IC > MaxInterleaveCount) 5290 IC = MaxInterleaveCount; 5291 else if (IC < 1) 5292 IC = 1; 5293 5294 // Interleave if we vectorized this loop and there is a reduction that could 5295 // benefit from interleaving. 5296 if (VF > 1 && !Legal->getReductionVars()->empty()) { 5297 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5298 return IC; 5299 } 5300 5301 // Note that if we've already vectorized the loop we will have done the 5302 // runtime check and so interleaving won't require further checks. 5303 bool InterleavingRequiresRuntimePointerCheck = 5304 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5305 5306 // We want to interleave small loops in order to reduce the loop overhead and 5307 // potentially expose ILP opportunities. 
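  // Worked example (illustrative numbers): if SmallLoopCost is 20 and the
  // estimated LoopCost is 3, the small-loop path below allows up to
  // PowerOf2Floor(20 / 3) = 4 interleaved copies, further limited by IC and
  // by the store/load port and nested-reduction heuristics that follow.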
5308 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5309 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5310 // We assume that the cost overhead is 1 and we use the cost model 5311 // to estimate the cost of the loop and interleave until the cost of the 5312 // loop overhead is about 5% of the cost of the loop. 5313 unsigned SmallIC = 5314 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5315 5316 // Interleave until store/load ports (estimated by max interleave count) are 5317 // saturated. 5318 unsigned NumStores = Legal->getNumStores(); 5319 unsigned NumLoads = Legal->getNumLoads(); 5320 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5321 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5322 5323 // If we have a scalar reduction (vector reductions are already dealt with 5324 // by this point), we can increase the critical path length if the loop 5325 // we're interleaving is inside another loop. Limit, by default to 2, so the 5326 // critical path only gets increased by one reduction operation. 5327 if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) { 5328 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5329 SmallIC = std::min(SmallIC, F); 5330 StoresIC = std::min(StoresIC, F); 5331 LoadsIC = std::min(LoadsIC, F); 5332 } 5333 5334 if (EnableLoadStoreRuntimeInterleave && 5335 std::max(StoresIC, LoadsIC) > SmallIC) { 5336 LLVM_DEBUG( 5337 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5338 return std::max(StoresIC, LoadsIC); 5339 } 5340 5341 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5342 return SmallIC; 5343 } 5344 5345 // Interleave if this is a large loop (small loops are already dealt with by 5346 // this point) that could benefit from interleaving. 5347 bool HasReductions = !Legal->getReductionVars()->empty(); 5348 if (TTI.enableAggressiveInterleaving(HasReductions)) { 5349 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5350 return IC; 5351 } 5352 5353 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5354 return 1; 5355 } 5356 5357 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5358 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { 5359 // This function calculates the register usage by measuring the highest number 5360 // of values that are alive at a single location. Obviously, this is a very 5361 // rough estimation. We scan the loop in a topological order in order and 5362 // assign a number to each instruction. We use RPO to ensure that defs are 5363 // met before their users. We assume that each instruction that has in-loop 5364 // users starts an interval. We record every time that an in-loop value is 5365 // used, so we have a list of the first and last occurrences of each 5366 // instruction. Next, we transpose this data structure into a multi map that 5367 // holds the list of intervals that *end* at a specific location. This multi 5368 // map allows us to perform a linear search. We scan the instructions linearly 5369 // and record each time that a new interval starts, by placing it in a set. 5370 // If we find this value in the multi-map then we remove it from the set. 5371 // The max register usage is the maximum size of the set. 5372 // We also search for instructions that are defined outside the loop, but are 5373 // used inside the loop. 
We need this number separately from the max-interval 5374 // usage number because when we unroll, loop-invariant values do not take 5375 // more register. 5376 LoopBlocksDFS DFS(TheLoop); 5377 DFS.perform(LI); 5378 5379 RegisterUsage RU; 5380 5381 // Each 'key' in the map opens a new interval. The values 5382 // of the map are the index of the 'last seen' usage of the 5383 // instruction that is the key. 5384 using IntervalMap = DenseMap<Instruction *, unsigned>; 5385 5386 // Maps instruction to its index. 5387 SmallVector<Instruction *, 64> IdxToInstr; 5388 // Marks the end of each interval. 5389 IntervalMap EndPoint; 5390 // Saves the list of instruction indices that are used in the loop. 5391 SmallPtrSet<Instruction *, 8> Ends; 5392 // Saves the list of values that are used in the loop but are 5393 // defined outside the loop, such as arguments and constants. 5394 SmallPtrSet<Value *, 8> LoopInvariants; 5395 5396 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5397 for (Instruction &I : BB->instructionsWithoutDebug()) { 5398 IdxToInstr.push_back(&I); 5399 5400 // Save the end location of each USE. 5401 for (Value *U : I.operands()) { 5402 auto *Instr = dyn_cast<Instruction>(U); 5403 5404 // Ignore non-instruction values such as arguments, constants, etc. 5405 if (!Instr) 5406 continue; 5407 5408 // If this instruction is outside the loop then record it and continue. 5409 if (!TheLoop->contains(Instr)) { 5410 LoopInvariants.insert(Instr); 5411 continue; 5412 } 5413 5414 // Overwrite previous end points. 5415 EndPoint[Instr] = IdxToInstr.size(); 5416 Ends.insert(Instr); 5417 } 5418 } 5419 } 5420 5421 // Saves the list of intervals that end with the index in 'key'. 5422 using InstrList = SmallVector<Instruction *, 2>; 5423 DenseMap<unsigned, InstrList> TransposeEnds; 5424 5425 // Transpose the EndPoints to a list of values that end at each index. 5426 for (auto &Interval : EndPoint) 5427 TransposeEnds[Interval.second].push_back(Interval.first); 5428 5429 SmallPtrSet<Instruction *, 8> OpenIntervals; 5430 5431 // Get the size of the widest register. 5432 unsigned MaxSafeDepDist = -1U; 5433 if (Legal->getMaxSafeDepDistBytes() != -1U) 5434 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5435 unsigned WidestRegister = 5436 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5437 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5438 5439 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5440 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5441 5442 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5443 5444 // A lambda that gets the register usage for the given type and VF. 5445 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5446 if (Ty->isTokenTy()) 5447 return 0U; 5448 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5449 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5450 }; 5451 5452 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5453 Instruction *I = IdxToInstr[i]; 5454 5455 // Remove all of the instructions that end at this location. 5456 InstrList &List = TransposeEnds[i]; 5457 for (Instruction *ToRemove : List) 5458 OpenIntervals.erase(ToRemove); 5459 5460 // Ignore instructions that are never used within the loop. 5461 if (Ends.find(I) == Ends.end()) 5462 continue; 5463 5464 // Skip ignored values. 5465 if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) 5466 continue; 5467 5468 // For each VF find the maximum usage of registers. 
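    // Illustrative example: with a 128-bit widest register, GetRegUsage
    // above charges max(1, VF * 32 / 128) registers for a 32-bit value, i.e.
    // one register for VF <= 4 and two registers for VF = 8.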
5469 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5470 // Count the number of live intervals. 5471 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5472 5473 if (VFs[j] == 1) { 5474 for (auto Inst : OpenIntervals) { 5475 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5476 if (RegUsage.find(ClassID) == RegUsage.end()) 5477 RegUsage[ClassID] = 1; 5478 else 5479 RegUsage[ClassID] += 1; 5480 } 5481 } else { 5482 collectUniformsAndScalars(VFs[j]); 5483 for (auto Inst : OpenIntervals) { 5484 // Skip ignored values for VF > 1. 5485 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) 5486 continue; 5487 if (isScalarAfterVectorization(Inst, VFs[j])) { 5488 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5489 if (RegUsage.find(ClassID) == RegUsage.end()) 5490 RegUsage[ClassID] = 1; 5491 else 5492 RegUsage[ClassID] += 1; 5493 } else { 5494 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5495 if (RegUsage.find(ClassID) == RegUsage.end()) 5496 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5497 else 5498 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5499 } 5500 } 5501 } 5502 5503 for (auto& pair : RegUsage) { 5504 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5505 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5506 else 5507 MaxUsages[j][pair.first] = pair.second; 5508 } 5509 } 5510 5511 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5512 << OpenIntervals.size() << '\n'); 5513 5514 // Add the current instruction to the list of open intervals. 5515 OpenIntervals.insert(I); 5516 } 5517 5518 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5519 SmallMapVector<unsigned, unsigned, 4> Invariant; 5520 5521 for (auto Inst : LoopInvariants) { 5522 unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5523 unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); 5524 if (Invariant.find(ClassID) == Invariant.end()) 5525 Invariant[ClassID] = Usage; 5526 else 5527 Invariant[ClassID] += Usage; 5528 } 5529 5530 LLVM_DEBUG({ 5531 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5532 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5533 << " item\n"; 5534 for (const auto &pair : MaxUsages[i]) { 5535 dbgs() << "LV(REG): RegisterClass: " 5536 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5537 << " registers\n"; 5538 } 5539 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5540 << " item\n"; 5541 for (const auto &pair : Invariant) { 5542 dbgs() << "LV(REG): RegisterClass: " 5543 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5544 << " registers\n"; 5545 } 5546 }); 5547 5548 RU.LoopInvariantRegs = Invariant; 5549 RU.MaxLocalUsers = MaxUsages[i]; 5550 RUs[i] = RU; 5551 } 5552 5553 return RUs; 5554 } 5555 5556 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5557 // TODO: Cost model for emulated masked load/store is completely 5558 // broken. This hack guides the cost model to use an artificially 5559 // high enough value to practically disable vectorization with such 5560 // operations, except where previously deployed legality hack allowed 5561 // using very low cost values. This is to avoid regressions coming simply 5562 // from moving "masked load/store" check from legality to cost model. 5563 // Masked Load/Gather emulation was previously never allowed. 5564 // Limited number of Masked Store/Scatter emulation was allowed. 
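  // In effect: return true for every emulated masked load, and for emulated
  // masked stores only once more than NumberOfStoresToPredicate predicated
  // stores have been seen.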
  assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}

void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine whether it would be better not to if-convert the blocks they are
  // in. If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
        if (!useEmulatedMaskMemRefHack(&I) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }
  }
}

int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
    unsigned VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  int Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
5638 // 5639 // We assume we will only emit a value for lane zero of an instruction 5640 // marked uniform after vectorization, rather than VF identical values. 5641 // Thus, if we scalarize an instruction that uses a uniform, we would 5642 // create uses of values corresponding to the lanes we aren't emitting code 5643 // for. This behavior can be changed by allowing getScalarValue to clone 5644 // the lane zero values for uniforms rather than asserting. 5645 for (Use &U : I->operands()) 5646 if (auto *J = dyn_cast<Instruction>(U.get())) 5647 if (isUniformAfterVectorization(J, VF)) 5648 return false; 5649 5650 // Otherwise, we can scalarize the instruction. 5651 return true; 5652 }; 5653 5654 // Compute the expected cost discount from scalarizing the entire expression 5655 // feeding the predicated instruction. We currently only consider expressions 5656 // that are single-use instruction chains. 5657 Worklist.push_back(PredInst); 5658 while (!Worklist.empty()) { 5659 Instruction *I = Worklist.pop_back_val(); 5660 5661 // If we've already analyzed the instruction, there's nothing to do. 5662 if (ScalarCosts.find(I) != ScalarCosts.end()) 5663 continue; 5664 5665 // Compute the cost of the vector instruction. Note that this cost already 5666 // includes the scalarization overhead of the predicated instruction. 5667 unsigned VectorCost = getInstructionCost(I, VF).first; 5668 5669 // Compute the cost of the scalarized instruction. This cost is the cost of 5670 // the instruction as if it wasn't if-converted and instead remained in the 5671 // predicated block. We will scale this cost by block probability after 5672 // computing the scalarization overhead. 5673 unsigned ScalarCost = VF * getInstructionCost(I, 1).first; 5674 5675 // Compute the scalarization overhead of needed insertelement instructions 5676 // and phi nodes. 5677 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5678 ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), 5679 true, false); 5680 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); 5681 } 5682 5683 // Compute the scalarization overhead of needed extractelement 5684 // instructions. For each of the instruction's operands, if the operand can 5685 // be scalarized, add it to the worklist; otherwise, account for the 5686 // overhead. 5687 for (Use &U : I->operands()) 5688 if (auto *J = dyn_cast<Instruction>(U.get())) { 5689 assert(VectorType::isValidElementType(J->getType()) && 5690 "Instruction has non-scalar type"); 5691 if (canBeScalarized(J)) 5692 Worklist.push_back(J); 5693 else if (needsExtract(J, VF)) 5694 ScalarCost += TTI.getScalarizationOverhead( 5695 ToVectorTy(J->getType(),VF), false, true); 5696 } 5697 5698 // Scale the total scalar cost by block probability. 5699 ScalarCost /= getReciprocalPredBlockProb(); 5700 5701 // Compute the discount. A non-negative discount means the vector version 5702 // of the instruction costs more, and scalarizing would be beneficial. 5703 Discount += VectorCost - ScalarCost; 5704 ScalarCosts[I] = ScalarCost; 5705 } 5706 5707 return Discount; 5708 } 5709 5710 LoopVectorizationCostModel::VectorizationCostTy 5711 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5712 VectorizationCostTy Cost; 5713 5714 // For each block. 5715 for (BasicBlock *BB : TheLoop->blocks()) { 5716 VectorizationCostTy BlockCost; 5717 5718 // For each instruction in the old loop. 5719 for (Instruction &I : BB->instructionsWithoutDebug()) { 5720 // Skip ignored values. 
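      // (ValuesToIgnore holds ephemeral values; VecValuesToIgnore holds the
      // type-promoting/casting instructions recorded during reduction and
      // induction detection, see collectValuesToIgnore().)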
5721 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() || 5722 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end())) 5723 continue; 5724 5725 VectorizationCostTy C = getInstructionCost(&I, VF); 5726 5727 // Check if we should override the cost. 5728 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5729 C.first = ForceTargetInstructionCost; 5730 5731 BlockCost.first += C.first; 5732 BlockCost.second |= C.second; 5733 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5734 << " for VF " << VF << " For instruction: " << I 5735 << '\n'); 5736 } 5737 5738 // If we are vectorizing a predicated block, it will have been 5739 // if-converted. This means that the block's instructions (aside from 5740 // stores and instructions that may divide by zero) will now be 5741 // unconditionally executed. For the scalar case, we may not always execute 5742 // the predicated block. Thus, scale the block's cost by the probability of 5743 // executing it. 5744 if (VF == 1 && blockNeedsPredication(BB)) 5745 BlockCost.first /= getReciprocalPredBlockProb(); 5746 5747 Cost.first += BlockCost.first; 5748 Cost.second |= BlockCost.second; 5749 } 5750 5751 return Cost; 5752 } 5753 5754 /// Gets Address Access SCEV after verifying that the access pattern 5755 /// is loop invariant except the induction variable dependence. 5756 /// 5757 /// This SCEV can be sent to the Target in order to estimate the address 5758 /// calculation cost. 5759 static const SCEV *getAddressAccessSCEV( 5760 Value *Ptr, 5761 LoopVectorizationLegality *Legal, 5762 PredicatedScalarEvolution &PSE, 5763 const Loop *TheLoop) { 5764 5765 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5766 if (!Gep) 5767 return nullptr; 5768 5769 // We are looking for a gep with all loop invariant indices except for one 5770 // which should be an induction variable. 5771 auto SE = PSE.getSE(); 5772 unsigned NumOperands = Gep->getNumOperands(); 5773 for (unsigned i = 1; i < NumOperands; ++i) { 5774 Value *Opd = Gep->getOperand(i); 5775 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5776 !Legal->isInductionVariable(Opd)) 5777 return nullptr; 5778 } 5779 5780 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 5781 return PSE.getSCEV(Ptr); 5782 } 5783 5784 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 5785 return Legal->hasStride(I->getOperand(0)) || 5786 Legal->hasStride(I->getOperand(1)); 5787 } 5788 5789 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5790 unsigned VF) { 5791 assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); 5792 Type *ValTy = getMemInstValueType(I); 5793 auto SE = PSE.getSE(); 5794 5795 unsigned AS = getLoadStoreAddressSpace(I); 5796 Value *Ptr = getLoadStorePointerOperand(I); 5797 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5798 5799 // Figure out whether the access is strided and get the stride value 5800 // if it's known in compile time 5801 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5802 5803 // Get the cost of the scalar memory instruction and address computation. 5804 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5805 5806 // Don't pass *I here, since it is scalar but will actually be part of a 5807 // vectorized loop where the user of it is a vectorized instruction. 
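  // Taken together, the scalarization cost modelled by this function is
  // roughly VF * (address computation + scalar memory op) plus the
  // insert/extract overhead, scaled by the probability of executing the
  // predicated block when the access is predicated (or bumped to an
  // artificially huge value under useEmulatedMaskMemRefHack).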
5808 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5809 Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 5810 Alignment, AS); 5811 5812 // Get the overhead of the extractelement and insertelement instructions 5813 // we might create due to scalarization. 5814 Cost += getScalarizationOverhead(I, VF); 5815 5816 // If we have a predicated store, it may not be executed for each vector 5817 // lane. Scale the cost by the probability of executing the predicated 5818 // block. 5819 if (isPredicatedInst(I)) { 5820 Cost /= getReciprocalPredBlockProb(); 5821 5822 if (useEmulatedMaskMemRefHack(I)) 5823 // Artificially setting to a high enough value to practically disable 5824 // vectorization with such operations. 5825 Cost = 3000000; 5826 } 5827 5828 return Cost; 5829 } 5830 5831 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5832 unsigned VF) { 5833 Type *ValTy = getMemInstValueType(I); 5834 Type *VectorTy = ToVectorTy(ValTy, VF); 5835 Value *Ptr = getLoadStorePointerOperand(I); 5836 unsigned AS = getLoadStoreAddressSpace(I); 5837 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 5838 5839 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5840 "Stride should be 1 or -1 for consecutive memory access"); 5841 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5842 unsigned Cost = 0; 5843 if (Legal->isMaskRequired(I)) 5844 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, 5845 Alignment ? Alignment->value() : 0, AS); 5846 else 5847 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); 5848 5849 bool Reverse = ConsecutiveStride < 0; 5850 if (Reverse) 5851 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5852 return Cost; 5853 } 5854 5855 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5856 unsigned VF) { 5857 Type *ValTy = getMemInstValueType(I); 5858 Type *VectorTy = ToVectorTy(ValTy, VF); 5859 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5860 unsigned AS = getLoadStoreAddressSpace(I); 5861 if (isa<LoadInst>(I)) { 5862 return TTI.getAddressComputationCost(ValTy) + 5863 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + 5864 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 5865 } 5866 StoreInst *SI = cast<StoreInst>(I); 5867 5868 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 5869 return TTI.getAddressComputationCost(ValTy) + 5870 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + 5871 (isLoopInvariantStoreValue 5872 ? 0 5873 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 5874 VF - 1)); 5875 } 5876 5877 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 5878 unsigned VF) { 5879 Type *ValTy = getMemInstValueType(I); 5880 Type *VectorTy = ToVectorTy(ValTy, VF); 5881 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5882 Value *Ptr = getLoadStorePointerOperand(I); 5883 5884 return TTI.getAddressComputationCost(VectorTy) + 5885 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, 5886 Legal->isMaskRequired(I), 5887 Alignment ? 
Alignment->value() : 0); 5888 } 5889 5890 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 5891 unsigned VF) { 5892 Type *ValTy = getMemInstValueType(I); 5893 Type *VectorTy = ToVectorTy(ValTy, VF); 5894 unsigned AS = getLoadStoreAddressSpace(I); 5895 5896 auto Group = getInterleavedAccessGroup(I); 5897 assert(Group && "Fail to get an interleaved access group."); 5898 5899 unsigned InterleaveFactor = Group->getFactor(); 5900 Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 5901 5902 // Holds the indices of existing members in an interleaved load group. 5903 // An interleaved store group doesn't need this as it doesn't allow gaps. 5904 SmallVector<unsigned, 4> Indices; 5905 if (isa<LoadInst>(I)) { 5906 for (unsigned i = 0; i < InterleaveFactor; i++) 5907 if (Group->getMember(i)) 5908 Indices.push_back(i); 5909 } 5910 5911 // Calculate the cost of the whole interleaved group. 5912 bool UseMaskForGaps = 5913 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5914 unsigned Cost = TTI.getInterleavedMemoryOpCost( 5915 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5916 Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps); 5917 5918 if (Group->isReverse()) { 5919 // TODO: Add support for reversed masked interleaved access. 5920 assert(!Legal->isMaskRequired(I) && 5921 "Reverse masked interleaved access not supported."); 5922 Cost += Group->getNumMembers() * 5923 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5924 } 5925 return Cost; 5926 } 5927 5928 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 5929 unsigned VF) { 5930 // Calculate scalar cost only. Vectorization cost should be ready at this 5931 // moment. 5932 if (VF == 1) { 5933 Type *ValTy = getMemInstValueType(I); 5934 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5935 unsigned AS = getLoadStoreAddressSpace(I); 5936 5937 return TTI.getAddressComputationCost(ValTy) + 5938 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); 5939 } 5940 return getWideningCost(I, VF); 5941 } 5942 5943 LoopVectorizationCostModel::VectorizationCostTy 5944 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 5945 // If we know that this instruction will remain uniform, check the cost of 5946 // the scalar version. 5947 if (isUniformAfterVectorization(I, VF)) 5948 VF = 1; 5949 5950 if (VF > 1 && isProfitableToScalarize(I, VF)) 5951 return VectorizationCostTy(InstsToScalarize[VF][I], false); 5952 5953 // Forced scalars do not have any scalarization overhead. 
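  // Their cost is simply VF copies of the scalar cost; e.g. an address
  // computation that setCostBasedWideningDecision decided to keep scalar is
  // charged VF * getInstructionCost(I, 1).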
5954 auto ForcedScalar = ForcedScalars.find(VF); 5955 if (VF > 1 && ForcedScalar != ForcedScalars.end()) { 5956 auto InstSet = ForcedScalar->second; 5957 if (InstSet.find(I) != InstSet.end()) 5958 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); 5959 } 5960 5961 Type *VectorTy; 5962 unsigned C = getInstructionCost(I, VF, VectorTy); 5963 5964 bool TypeNotScalarized = 5965 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; 5966 return VectorizationCostTy(C, TypeNotScalarized); 5967 } 5968 5969 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 5970 unsigned VF) { 5971 5972 if (VF == 1) 5973 return 0; 5974 5975 unsigned Cost = 0; 5976 Type *RetTy = ToVectorTy(I->getType(), VF); 5977 if (!RetTy->isVoidTy() && 5978 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 5979 Cost += TTI.getScalarizationOverhead(RetTy, true, false); 5980 5981 // Some targets keep addresses scalar. 5982 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 5983 return Cost; 5984 5985 // Some targets support efficient element stores. 5986 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 5987 return Cost; 5988 5989 // Collect operands to consider. 5990 CallInst *CI = dyn_cast<CallInst>(I); 5991 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 5992 5993 // Skip operands that do not require extraction/scalarization and do not incur 5994 // any overhead. 5995 return Cost + TTI.getOperandsScalarizationOverhead( 5996 filterExtractingOperands(Ops, VF), VF); 5997 } 5998 5999 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { 6000 if (VF == 1) 6001 return; 6002 NumPredStores = 0; 6003 for (BasicBlock *BB : TheLoop->blocks()) { 6004 // For each instruction in the old loop. 6005 for (Instruction &I : *BB) { 6006 Value *Ptr = getLoadStorePointerOperand(&I); 6007 if (!Ptr) 6008 continue; 6009 6010 // TODO: We should generate better code and update the cost model for 6011 // predicated uniform stores. Today they are treated as any other 6012 // predicated store (see added test cases in 6013 // invariant-store-vectorization.ll). 6014 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6015 NumPredStores++; 6016 6017 if (Legal->isUniform(Ptr) && 6018 // Conditional loads and stores should be scalarized and predicated. 6019 // isScalarWithPredication cannot be used here since masked 6020 // gather/scatters are not considered scalar with predication. 6021 !Legal->blockNeedsPredication(I.getParent())) { 6022 // TODO: Avoid replicating loads and stores instead of 6023 // relying on instcombine to remove them. 6024 // Load: Scalar load + broadcast 6025 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6026 unsigned Cost = getUniformMemOpCost(&I, VF); 6027 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6028 continue; 6029 } 6030 6031 // We assume that widening is the best solution when possible. 6032 if (memoryInstructionCanBeWidened(&I, VF)) { 6033 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6034 int ConsecutiveStride = 6035 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6036 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6037 "Expected consecutive stride."); 6038 InstWidening Decision = 6039 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6040 setWideningDecision(&I, VF, Decision, Cost); 6041 continue; 6042 } 6043 6044 // Choose between Interleaving, Gather/Scatter or Scalarization. 
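      // The cheapest option wins: a tie between interleaving and
      // gather/scatter goes to the interleave group, while any tie with
      // scalarization goes to scalarization.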
6045 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6046 unsigned NumAccesses = 1; 6047 if (isAccessInterleaved(&I)) { 6048 auto Group = getInterleavedAccessGroup(&I); 6049 assert(Group && "Fail to get an interleaved access group."); 6050 6051 // Make one decision for the whole group. 6052 if (getWideningDecision(&I, VF) != CM_Unknown) 6053 continue; 6054 6055 NumAccesses = Group->getNumMembers(); 6056 if (interleavedAccessCanBeWidened(&I, VF)) 6057 InterleaveCost = getInterleaveGroupCost(&I, VF); 6058 } 6059 6060 unsigned GatherScatterCost = 6061 isLegalGatherOrScatter(&I) 6062 ? getGatherScatterCost(&I, VF) * NumAccesses 6063 : std::numeric_limits<unsigned>::max(); 6064 6065 unsigned ScalarizationCost = 6066 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6067 6068 // Choose better solution for the current VF, 6069 // write down this decision and use it during vectorization. 6070 unsigned Cost; 6071 InstWidening Decision; 6072 if (InterleaveCost <= GatherScatterCost && 6073 InterleaveCost < ScalarizationCost) { 6074 Decision = CM_Interleave; 6075 Cost = InterleaveCost; 6076 } else if (GatherScatterCost < ScalarizationCost) { 6077 Decision = CM_GatherScatter; 6078 Cost = GatherScatterCost; 6079 } else { 6080 Decision = CM_Scalarize; 6081 Cost = ScalarizationCost; 6082 } 6083 // If the instructions belongs to an interleave group, the whole group 6084 // receives the same decision. The whole group receives the cost, but 6085 // the cost will actually be assigned to one instruction. 6086 if (auto Group = getInterleavedAccessGroup(&I)) 6087 setWideningDecision(Group, VF, Decision, Cost); 6088 else 6089 setWideningDecision(&I, VF, Decision, Cost); 6090 } 6091 } 6092 6093 // Make sure that any load of address and any other address computation 6094 // remains scalar unless there is gather/scatter support. This avoids 6095 // inevitable extracts into address registers, and also has the benefit of 6096 // activating LSR more, since that pass can't optimize vectorized 6097 // addresses. 6098 if (TTI.prefersVectorizedAddressing()) 6099 return; 6100 6101 // Start with all scalar pointer uses. 6102 SmallPtrSet<Instruction *, 8> AddrDefs; 6103 for (BasicBlock *BB : TheLoop->blocks()) 6104 for (Instruction &I : *BB) { 6105 Instruction *PtrDef = 6106 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6107 if (PtrDef && TheLoop->contains(PtrDef) && 6108 getWideningDecision(&I, VF) != CM_GatherScatter) 6109 AddrDefs.insert(PtrDef); 6110 } 6111 6112 // Add all instructions used to generate the addresses. 6113 SmallVector<Instruction *, 4> Worklist; 6114 for (auto *I : AddrDefs) 6115 Worklist.push_back(I); 6116 while (!Worklist.empty()) { 6117 Instruction *I = Worklist.pop_back_val(); 6118 for (auto &Op : I->operands()) 6119 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6120 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6121 AddrDefs.insert(InstOp).second) 6122 Worklist.push_back(InstOp); 6123 } 6124 6125 for (auto *I : AddrDefs) { 6126 if (isa<LoadInst>(I)) { 6127 // Setting the desired widening decision should ideally be handled in 6128 // by cost functions, but since this involves the task of finding out 6129 // if the loaded register is involved in an address computation, it is 6130 // instead changed here when we know this is the case. 6131 InstWidening Decision = getWideningDecision(I, VF); 6132 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6133 // Scalarize a widened load of address. 
6134 setWideningDecision(I, VF, CM_Scalarize, 6135 (VF * getMemoryInstructionCost(I, 1))); 6136 else if (auto Group = getInterleavedAccessGroup(I)) { 6137 // Scalarize an interleave group of address loads. 6138 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6139 if (Instruction *Member = Group->getMember(I)) 6140 setWideningDecision(Member, VF, CM_Scalarize, 6141 (VF * getMemoryInstructionCost(Member, 1))); 6142 } 6143 } 6144 } else 6145 // Make sure I gets scalarized and a cost estimate without 6146 // scalarization overhead. 6147 ForcedScalars[VF].insert(I); 6148 } 6149 } 6150 6151 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6152 unsigned VF, 6153 Type *&VectorTy) { 6154 Type *RetTy = I->getType(); 6155 if (canTruncateToMinimalBitwidth(I, VF)) 6156 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6157 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6158 auto SE = PSE.getSE(); 6159 6160 // TODO: We need to estimate the cost of intrinsic calls. 6161 switch (I->getOpcode()) { 6162 case Instruction::GetElementPtr: 6163 // We mark this instruction as zero-cost because the cost of GEPs in 6164 // vectorized code depends on whether the corresponding memory instruction 6165 // is scalarized or not. Therefore, we handle GEPs with the memory 6166 // instruction cost. 6167 return 0; 6168 case Instruction::Br: { 6169 // In cases of scalarized and predicated instructions, there will be VF 6170 // predicated blocks in the vectorized loop. Each branch around these 6171 // blocks requires also an extract of its vector compare i1 element. 6172 bool ScalarPredicatedBB = false; 6173 BranchInst *BI = cast<BranchInst>(I); 6174 if (VF > 1 && BI->isConditional() && 6175 (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) != 6176 PredicatedBBsAfterVectorization.end() || 6177 PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) != 6178 PredicatedBBsAfterVectorization.end())) 6179 ScalarPredicatedBB = true; 6180 6181 if (ScalarPredicatedBB) { 6182 // Return cost for branches around scalarized and predicated blocks. 6183 Type *Vec_i1Ty = 6184 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6185 return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) + 6186 (TTI.getCFInstrCost(Instruction::Br) * VF)); 6187 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) 6188 // The back-edge branch will remain, as will all scalar branches. 6189 return TTI.getCFInstrCost(Instruction::Br); 6190 else 6191 // This branch will be eliminated by if-conversion. 6192 return 0; 6193 // Note: We currently assume zero cost for an unconditional branch inside 6194 // a predicated block since it will become a fall-through, although we 6195 // may decide in the future to call TTI for all branches. 6196 } 6197 case Instruction::PHI: { 6198 auto *Phi = cast<PHINode>(I); 6199 6200 // First-order recurrences are replaced by vector shuffles inside the loop. 6201 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6202 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) 6203 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 6204 VectorTy, VF - 1, VectorType::get(RetTy, 1)); 6205 6206 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6207 // converted into select instructions. We require N - 1 selects per phi 6208 // node, where N is the number of incoming values. 
6209 if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) 6210 return (Phi->getNumIncomingValues() - 1) * 6211 TTI.getCmpSelInstrCost( 6212 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6213 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)); 6214 6215 return TTI.getCFInstrCost(Instruction::PHI); 6216 } 6217 case Instruction::UDiv: 6218 case Instruction::SDiv: 6219 case Instruction::URem: 6220 case Instruction::SRem: 6221 // If we have a predicated instruction, it may not be executed for each 6222 // vector lane. Get the scalarization cost and scale this amount by the 6223 // probability of executing the predicated block. If the instruction is not 6224 // predicated, we fall through to the next case. 6225 if (VF > 1 && isScalarWithPredication(I)) { 6226 unsigned Cost = 0; 6227 6228 // These instructions have a non-void type, so account for the phi nodes 6229 // that we will create. This cost is likely to be zero. The phi node 6230 // cost, if any, should be scaled by the block probability because it 6231 // models a copy at the end of each predicated block. 6232 Cost += VF * TTI.getCFInstrCost(Instruction::PHI); 6233 6234 // The cost of the non-predicated instruction. 6235 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy); 6236 6237 // The cost of insertelement and extractelement instructions needed for 6238 // scalarization. 6239 Cost += getScalarizationOverhead(I, VF); 6240 6241 // Scale the cost by the probability of executing the predicated blocks. 6242 // This assumes the predicated block for each vector lane is equally 6243 // likely. 6244 return Cost / getReciprocalPredBlockProb(); 6245 } 6246 LLVM_FALLTHROUGH; 6247 case Instruction::Add: 6248 case Instruction::FAdd: 6249 case Instruction::Sub: 6250 case Instruction::FSub: 6251 case Instruction::Mul: 6252 case Instruction::FMul: 6253 case Instruction::FDiv: 6254 case Instruction::FRem: 6255 case Instruction::Shl: 6256 case Instruction::LShr: 6257 case Instruction::AShr: 6258 case Instruction::And: 6259 case Instruction::Or: 6260 case Instruction::Xor: { 6261 // Since we will replace the stride by 1 the multiplication should go away. 6262 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6263 return 0; 6264 // Certain instructions can be cheaper to vectorize if they have a constant 6265 // second vector operand. One example of this are shifts on x86. 6266 Value *Op2 = I->getOperand(1); 6267 TargetTransformInfo::OperandValueProperties Op2VP; 6268 TargetTransformInfo::OperandValueKind Op2VK = 6269 TTI.getOperandInfo(Op2, Op2VP); 6270 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6271 Op2VK = TargetTransformInfo::OK_UniformValue; 6272 6273 SmallVector<const Value *, 4> Operands(I->operand_values()); 6274 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6275 return N * TTI.getArithmeticInstrCost( 6276 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6277 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6278 } 6279 case Instruction::FNeg: { 6280 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6281 return N * TTI.getArithmeticInstrCost( 6282 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6283 TargetTransformInfo::OK_AnyValue, 6284 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6285 I->getOperand(0), I); 6286 } 6287 case Instruction::Select: { 6288 SelectInst *SI = cast<SelectInst>(I); 6289 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6290 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6291 Type *CondTy = SI->getCondition()->getType(); 6292 if (!ScalarCond) 6293 CondTy = VectorType::get(CondTy, VF); 6294 6295 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I); 6296 } 6297 case Instruction::ICmp: 6298 case Instruction::FCmp: { 6299 Type *ValTy = I->getOperand(0)->getType(); 6300 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6301 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6302 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6303 VectorTy = ToVectorTy(ValTy, VF); 6304 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I); 6305 } 6306 case Instruction::Store: 6307 case Instruction::Load: { 6308 unsigned Width = VF; 6309 if (Width > 1) { 6310 InstWidening Decision = getWideningDecision(I, Width); 6311 assert(Decision != CM_Unknown && 6312 "CM decision should be taken at this point"); 6313 if (Decision == CM_Scalarize) 6314 Width = 1; 6315 } 6316 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6317 return getMemoryInstructionCost(I, VF); 6318 } 6319 case Instruction::ZExt: 6320 case Instruction::SExt: 6321 case Instruction::FPToUI: 6322 case Instruction::FPToSI: 6323 case Instruction::FPExt: 6324 case Instruction::PtrToInt: 6325 case Instruction::IntToPtr: 6326 case Instruction::SIToFP: 6327 case Instruction::UIToFP: 6328 case Instruction::Trunc: 6329 case Instruction::FPTrunc: 6330 case Instruction::BitCast: { 6331 // We optimize the truncation of induction variables having constant 6332 // integer steps. The cost of these truncations is the same as the scalar 6333 // operation. 6334 if (isOptimizableIVTruncate(I, VF)) { 6335 auto *Trunc = cast<TruncInst>(I); 6336 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6337 Trunc->getSrcTy(), Trunc); 6338 } 6339 6340 Type *SrcScalarTy = I->getOperand(0)->getType(); 6341 Type *SrcVecTy = 6342 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6343 if (canTruncateToMinimalBitwidth(I, VF)) { 6344 // This cast is going to be shrunk. This may remove the cast or it might 6345 // turn it into slightly different cast. For example, if MinBW == 16, 6346 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6347 // 6348 // Calculate the modified src and dest types. 6349 Type *MinVecTy = VectorTy; 6350 if (I->getOpcode() == Instruction::Trunc) { 6351 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6352 VectorTy = 6353 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6354 } else if (I->getOpcode() == Instruction::ZExt || 6355 I->getOpcode() == Instruction::SExt) { 6356 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6357 VectorTy = 6358 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6359 } 6360 } 6361 6362 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6363 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); 6364 } 6365 case Instruction::Call: { 6366 bool NeedToScalarize; 6367 CallInst *CI = cast<CallInst>(I); 6368 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6369 if (getVectorIntrinsicIDForCall(CI, TLI)) 6370 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6371 return CallCost; 6372 } 6373 default: 6374 // The cost of executing VF copies of the scalar instruction. This opcode 6375 // is unknown. Assume that it is the same as 'mul'. 6376 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + 6377 getScalarizationOverhead(I, VF); 6378 } // end of switch. 6379 } 6380 6381 char LoopVectorize::ID = 0; 6382 6383 static const char lv_name[] = "Loop Vectorization"; 6384 6385 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6386 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6387 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6388 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6389 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6390 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6391 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6392 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6393 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6394 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6395 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6396 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6397 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6398 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6399 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6400 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6401 6402 namespace llvm { 6403 6404 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6405 6406 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6407 bool VectorizeOnlyWhenForced) { 6408 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6409 } 6410 6411 } // end namespace llvm 6412 6413 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6414 // Check if the pointer operand of a load or store instruction is 6415 // consecutive. 6416 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6417 return Legal->isConsecutivePtr(Ptr); 6418 return false; 6419 } 6420 6421 void LoopVectorizationCostModel::collectValuesToIgnore() { 6422 // Ignore ephemeral values. 6423 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6424 6425 // Ignore type-promoting instructions we identified during reduction 6426 // detection. 6427 for (auto &Reduction : *Legal->getReductionVars()) { 6428 RecurrenceDescriptor &RedDes = Reduction.second; 6429 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6430 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6431 } 6432 // Ignore type-casting instructions we identified during induction 6433 // detection. 6434 for (auto &Induction : *Legal->getInductionVars()) { 6435 InductionDescriptor &IndDes = Induction.second; 6436 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6437 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6438 } 6439 } 6440 6441 // TODO: we could return a pair of values that specify the max VF and 6442 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6443 // `buildVPlans(VF, VF)`. 
// We cannot do this yet because VPlan does not currently have a cost model
// that can choose which plan to execute when more than one is generated.
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                                 LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
  return WidestVectorRegBits / WidestType;
}

VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
  unsigned VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
  if (!OrigLoop->empty()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (!UserVF) {
      VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && VF < 2) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = 4;
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
                      << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
  assert(OrigLoop->empty() && "Inner loop expected.");
  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
    return None;

  // Invalidate interleave groups if all blocks of the loop will be predicated.
  if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(*TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    CM.InterleaveInfo.reset();
  }

  if (UserVF) {
    LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    CM.selectUserVectorizationFactor(UserVF);
    buildVPlansWithVPRecipes(UserVF, UserVF);
    LLVM_DEBUG(printPlans(dbgs()));
    return {{UserVF, 0}};
  }

  unsigned MaxVF = MaybeMaxVF.getValue();
  assert(MaxVF != 0 && "MaxVF is zero.");

  for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(VF);

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
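    // (Scalarization candidates only exist for VF > 1; with VF == 1 nothing
    // is widened, so there is nothing to scalarize back.)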
6529 if (VF > 1) 6530 CM.collectInstsToScalarize(VF); 6531 } 6532 6533 buildVPlansWithVPRecipes(1, MaxVF); 6534 LLVM_DEBUG(printPlans(dbgs())); 6535 if (MaxVF == 1) 6536 return VectorizationFactor::Disabled(); 6537 6538 // Select the optimal vectorization factor. 6539 return CM.selectVectorizationFactor(MaxVF); 6540 } 6541 6542 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { 6543 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 6544 << '\n'); 6545 BestVF = VF; 6546 BestUF = UF; 6547 6548 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 6549 return !Plan->hasVF(VF); 6550 }); 6551 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 6552 } 6553 6554 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 6555 DominatorTree *DT) { 6556 // Perform the actual loop transformation. 6557 6558 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 6559 VPCallbackILV CallbackILV(ILV); 6560 6561 VPTransformState State{BestVF, BestUF, LI, 6562 DT, ILV.Builder, ILV.VectorLoopValueMap, 6563 &ILV, CallbackILV}; 6564 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 6565 State.TripCount = ILV.getOrCreateTripCount(nullptr); 6566 6567 //===------------------------------------------------===// 6568 // 6569 // Notice: any optimization or new instruction that go 6570 // into the code below should also be implemented in 6571 // the cost-model. 6572 // 6573 //===------------------------------------------------===// 6574 6575 // 2. Copy and widen instructions from the old loop into the new loop. 6576 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 6577 VPlans.front()->execute(&State); 6578 6579 // 3. Fix the vectorized code: take care of header phi's, live-outs, 6580 // predication, updating analyses. 6581 ILV.fixVectorizedLoop(); 6582 } 6583 6584 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 6585 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6586 BasicBlock *Latch = OrigLoop->getLoopLatch(); 6587 6588 // We create new control-flow for the vectorized loop, so the original 6589 // condition will be dead after vectorization if it's only used by the 6590 // branch. 6591 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 6592 if (Cmp && Cmp->hasOneUse()) 6593 DeadInstructions.insert(Cmp); 6594 6595 // We create new "steps" for induction variable updates to which the original 6596 // induction variables map. An original update instruction will be dead if 6597 // all its users except the induction variable are dead. 6598 for (auto &Induction : *Legal->getInductionVars()) { 6599 PHINode *Ind = Induction.first; 6600 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 6601 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 6602 return U == Ind || DeadInstructions.find(cast<Instruction>(U)) != 6603 DeadInstructions.end(); 6604 })) 6605 DeadInstructions.insert(IndUpdate); 6606 6607 // We record as "Dead" also the type-casting instructions we had identified 6608 // during induction analysis. We don't need any handling for them in the 6609 // vectorized loop because we have proven that, under a proper runtime 6610 // test guarding the vectorized loop, the value of the phi, and the casted 6611 // value of the phi, are the same. The last instruction in this casting chain 6612 // will get its scalar/vector/widened def from the scalar/vector/widened def 6613 // of the respective phi node. 
Any other casts in the induction def-use chain 6614 // have no other uses outside the phi update chain, and will be ignored. 6615 InductionDescriptor &IndDes = Induction.second; 6616 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6617 DeadInstructions.insert(Casts.begin(), Casts.end()); 6618 } 6619 } 6620 6621 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6622 6623 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6624 6625 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6626 Instruction::BinaryOps BinOp) { 6627 // When unrolling and the VF is 1, we only need to add a simple scalar. 6628 Type *Ty = Val->getType(); 6629 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6630 6631 if (Ty->isFloatingPointTy()) { 6632 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6633 6634 // Floating point operations had to be 'fast' to enable the unrolling. 6635 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6636 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6637 } 6638 Constant *C = ConstantInt::get(Ty, StartIdx); 6639 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6640 } 6641 6642 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6643 SmallVector<Metadata *, 4> MDs; 6644 // Reserve first location for self reference to the LoopID metadata node. 6645 MDs.push_back(nullptr); 6646 bool IsUnrollMetadata = false; 6647 MDNode *LoopID = L->getLoopID(); 6648 if (LoopID) { 6649 // First find existing loop unrolling disable metadata. 6650 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6651 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6652 if (MD) { 6653 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6654 IsUnrollMetadata = 6655 S && S->getString().startswith("llvm.loop.unroll.disable"); 6656 } 6657 MDs.push_back(LoopID->getOperand(i)); 6658 } 6659 } 6660 6661 if (!IsUnrollMetadata) { 6662 // Add runtime unroll disable metadata. 6663 LLVMContext &Context = L->getHeader()->getContext(); 6664 SmallVector<Metadata *, 1> DisableOperands; 6665 DisableOperands.push_back( 6666 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6667 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6668 MDs.push_back(DisableNode); 6669 MDNode *NewLoopID = MDNode::get(Context, MDs); 6670 // Set operand 0 to refer to the loop id itself. 6671 NewLoopID->replaceOperandWith(0, NewLoopID); 6672 L->setLoopID(NewLoopID); 6673 } 6674 } 6675 6676 bool LoopVectorizationPlanner::getDecisionAndClampRange( 6677 const std::function<bool(unsigned)> &Predicate, VFRange &Range) { 6678 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 6679 bool PredicateAtRangeStart = Predicate(Range.Start); 6680 6681 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 6682 if (Predicate(TmpVF) != PredicateAtRangeStart) { 6683 Range.End = TmpVF; 6684 break; 6685 } 6686 6687 return PredicateAtRangeStart; 6688 } 6689 6690 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 6691 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 6692 /// of VF's starting at a given VF and extending it as much as possible. Each 6693 /// vectorization decision can potentially shorten this sub-range during 6694 /// buildVPlan(). 
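///
/// For example (illustrative values only): with MinVF = 1 and MaxVF = 8 the
/// first iteration builds a VPlan for the candidate range [1, 9); if a
/// decision clamps that range to [1, 4), the next VPlan starts at VF = 4,
/// and so on until MaxVF is covered.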
6695 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 6696 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6697 VFRange SubRange = {VF, MaxVF + 1}; 6698 VPlans.push_back(buildVPlan(SubRange)); 6699 VF = SubRange.End; 6700 } 6701 } 6702 6703 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 6704 VPlanPtr &Plan) { 6705 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 6706 6707 // Look for cached value. 6708 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 6709 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 6710 if (ECEntryIt != EdgeMaskCache.end()) 6711 return ECEntryIt->second; 6712 6713 VPValue *SrcMask = createBlockInMask(Src, Plan); 6714 6715 // The terminator has to be a branch inst! 6716 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 6717 assert(BI && "Unexpected terminator found"); 6718 6719 if (!BI->isConditional()) 6720 return EdgeMaskCache[Edge] = SrcMask; 6721 6722 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 6723 assert(EdgeMask && "No Edge Mask found for condition"); 6724 6725 if (BI->getSuccessor(0) != Dst) 6726 EdgeMask = Builder.createNot(EdgeMask); 6727 6728 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 6729 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 6730 6731 return EdgeMaskCache[Edge] = EdgeMask; 6732 } 6733 6734 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 6735 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 6736 6737 // Look for cached value. 6738 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 6739 if (BCEntryIt != BlockMaskCache.end()) 6740 return BCEntryIt->second; 6741 6742 // All-one mask is modelled as no-mask following the convention for masked 6743 // load/store/gather/scatter. Initialize BlockMask to no-mask. 6744 VPValue *BlockMask = nullptr; 6745 6746 if (OrigLoop->getHeader() == BB) { 6747 if (!CM.blockNeedsPredication(BB)) 6748 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 6749 6750 // Introduce the early-exit compare IV <= BTC to form header block mask. 6751 // This is used instead of IV < TC because TC may wrap, unlike BTC. 6752 VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction()); 6753 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 6754 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 6755 return BlockMaskCache[BB] = BlockMask; 6756 } 6757 6758 // This is the block mask. We OR all incoming edges. 6759 for (auto *Predecessor : predecessors(BB)) { 6760 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 6761 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 6762 return BlockMaskCache[BB] = EdgeMask; 6763 6764 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
6765 BlockMask = EdgeMask; 6766 continue; 6767 } 6768 6769 BlockMask = Builder.createOr(BlockMask, EdgeMask); 6770 } 6771 6772 return BlockMaskCache[BB] = BlockMask; 6773 } 6774 6775 VPWidenMemoryInstructionRecipe * 6776 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 6777 VPlanPtr &Plan) { 6778 if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) 6779 return nullptr; 6780 6781 auto willWiden = [&](unsigned VF) -> bool { 6782 if (VF == 1) 6783 return false; 6784 LoopVectorizationCostModel::InstWidening Decision = 6785 CM.getWideningDecision(I, VF); 6786 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 6787 "CM decision should be taken at this point."); 6788 if (Decision == LoopVectorizationCostModel::CM_Interleave) 6789 return true; 6790 if (CM.isScalarAfterVectorization(I, VF) || 6791 CM.isProfitableToScalarize(I, VF)) 6792 return false; 6793 return Decision != LoopVectorizationCostModel::CM_Scalarize; 6794 }; 6795 6796 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6797 return nullptr; 6798 6799 VPValue *Mask = nullptr; 6800 if (Legal->isMaskRequired(I)) 6801 Mask = createBlockInMask(I->getParent(), Plan); 6802 6803 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 6804 return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask); 6805 } 6806 6807 VPWidenIntOrFpInductionRecipe * 6808 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) { 6809 if (PHINode *Phi = dyn_cast<PHINode>(I)) { 6810 // Check if this is an integer or fp induction. If so, build the recipe that 6811 // produces its scalar and vector values. 6812 InductionDescriptor II = Legal->getInductionVars()->lookup(Phi); 6813 if (II.getKind() == InductionDescriptor::IK_IntInduction || 6814 II.getKind() == InductionDescriptor::IK_FpInduction) 6815 return new VPWidenIntOrFpInductionRecipe(Phi); 6816 6817 return nullptr; 6818 } 6819 6820 // Optimize the special case where the source is a constant integer 6821 // induction variable. Notice that we can only optimize the 'trunc' case 6822 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 6823 // (c) other casts depend on pointer size. 6824 6825 // Determine whether \p K is a truncation based on an induction variable that 6826 // can be optimized. 6827 auto isOptimizableIVTruncate = 6828 [&](Instruction *K) -> std::function<bool(unsigned)> { 6829 return 6830 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; 6831 }; 6832 6833 if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange( 6834 isOptimizableIVTruncate(I), Range)) 6835 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 6836 cast<TruncInst>(I)); 6837 return nullptr; 6838 } 6839 6840 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) { 6841 PHINode *Phi = dyn_cast<PHINode>(I); 6842 if (!Phi || Phi->getParent() == OrigLoop->getHeader()) 6843 return nullptr; 6844 6845 // We know that all PHIs in non-header blocks are converted into selects, so 6846 // we don't have to worry about the insertion order and we can just use the 6847 // builder. At this point we generate the predication tree. There may be 6848 // duplications since this is a simple recursive scan, but future 6849 // optimizations will clean it up. 
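  // For example, a phi such as
  //   %p = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]
  // in a non-header block becomes a VPBlendRecipe selecting %a or %b based on
  // the masks of the %bb1 and %bb2 incoming edges (the %names here are
  // illustrative).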
6850 6851 SmallVector<VPValue *, 2> Masks; 6852 unsigned NumIncoming = Phi->getNumIncomingValues(); 6853 for (unsigned In = 0; In < NumIncoming; In++) { 6854 VPValue *EdgeMask = 6855 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 6856 assert((EdgeMask || NumIncoming == 1) && 6857 "Multiple predecessors with one having a full mask"); 6858 if (EdgeMask) 6859 Masks.push_back(EdgeMask); 6860 } 6861 return new VPBlendRecipe(Phi, Masks); 6862 } 6863 6864 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, 6865 VFRange &Range) { 6866 6867 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6868 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6869 6870 if (IsPredicated) 6871 return false; 6872 6873 auto IsVectorizableOpcode = [](unsigned Opcode) { 6874 switch (Opcode) { 6875 case Instruction::Add: 6876 case Instruction::And: 6877 case Instruction::AShr: 6878 case Instruction::BitCast: 6879 case Instruction::Br: 6880 case Instruction::Call: 6881 case Instruction::FAdd: 6882 case Instruction::FCmp: 6883 case Instruction::FDiv: 6884 case Instruction::FMul: 6885 case Instruction::FNeg: 6886 case Instruction::FPExt: 6887 case Instruction::FPToSI: 6888 case Instruction::FPToUI: 6889 case Instruction::FPTrunc: 6890 case Instruction::FRem: 6891 case Instruction::FSub: 6892 case Instruction::ICmp: 6893 case Instruction::IntToPtr: 6894 case Instruction::Load: 6895 case Instruction::LShr: 6896 case Instruction::Mul: 6897 case Instruction::Or: 6898 case Instruction::PHI: 6899 case Instruction::PtrToInt: 6900 case Instruction::SDiv: 6901 case Instruction::Select: 6902 case Instruction::SExt: 6903 case Instruction::Shl: 6904 case Instruction::SIToFP: 6905 case Instruction::SRem: 6906 case Instruction::Store: 6907 case Instruction::Sub: 6908 case Instruction::Trunc: 6909 case Instruction::UDiv: 6910 case Instruction::UIToFP: 6911 case Instruction::URem: 6912 case Instruction::Xor: 6913 case Instruction::ZExt: 6914 return true; 6915 } 6916 return false; 6917 }; 6918 6919 if (!IsVectorizableOpcode(I->getOpcode())) 6920 return false; 6921 6922 if (CallInst *CI = dyn_cast<CallInst>(I)) { 6923 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6924 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 6925 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 6926 return false; 6927 } 6928 6929 auto willWiden = [&](unsigned VF) -> bool { 6930 if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) || 6931 CM.isProfitableToScalarize(I, VF))) 6932 return false; 6933 if (CallInst *CI = dyn_cast<CallInst>(I)) { 6934 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6935 // The following case may be scalarized depending on the VF. 6936 // The flag shows whether we use Intrinsic or a usual Call for vectorized 6937 // version of the instruction. 6938 // Is it beneficial to perform intrinsic call compared to lib call? 
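      // A vector intrinsic is used when one exists for this call and its cost
      // does not exceed the vector library-call cost; otherwise widening is
      // only worthwhile if the call does not need to be scalarized.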
6939 bool NeedToScalarize; 6940 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 6941 bool UseVectorIntrinsic = 6942 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 6943 return UseVectorIntrinsic || !NeedToScalarize; 6944 } 6945 if (isa<LoadInst>(I) || isa<StoreInst>(I)) { 6946 assert(CM.getWideningDecision(I, VF) == 6947 LoopVectorizationCostModel::CM_Scalarize && 6948 "Memory widening decisions should have been taken care by now"); 6949 return false; 6950 } 6951 return true; 6952 }; 6953 6954 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6955 return false; 6956 // If this ingredient's recipe is to be recorded, keep its recipe a singleton 6957 // to avoid having to split recipes later. 6958 bool IsSingleton = Ingredient2Recipe.count(I); 6959 6960 // Success: widen this instruction. 6961 6962 // Use the default widening recipe. We optimize the common case where 6963 // consecutive instructions can be represented by a single recipe. 6964 if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() && 6965 LastExtensibleRecipe->appendInstruction(I)) 6966 return true; 6967 6968 VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I); 6969 if (!IsSingleton) 6970 LastExtensibleRecipe = WidenRecipe; 6971 setRecipe(I, WidenRecipe); 6972 VPBB->appendRecipe(WidenRecipe); 6973 return true; 6974 } 6975 6976 VPBasicBlock *VPRecipeBuilder::handleReplication( 6977 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 6978 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 6979 VPlanPtr &Plan) { 6980 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 6981 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, 6982 Range); 6983 6984 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6985 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6986 6987 auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); 6988 setRecipe(I, Recipe); 6989 6990 // Find if I uses a predicated instruction. If so, it will use its scalar 6991 // value. Avoid hoisting the insert-element which packs the scalar value into 6992 // a vector value, as that happens iff all users use the vector value. 6993 for (auto &Op : I->operands()) 6994 if (auto *PredInst = dyn_cast<Instruction>(Op)) 6995 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 6996 PredInst2Recipe[PredInst]->setAlsoPack(false); 6997 6998 // Finalize the recipe for Instr, first if it is not predicated. 6999 if (!IsPredicated) { 7000 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 7001 VPBB->appendRecipe(Recipe); 7002 return VPBB; 7003 } 7004 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7005 assert(VPBB->getSuccessors().empty() && 7006 "VPBB has successors when handling predicated replication."); 7007 // Record predicated instructions for above packing optimizations. 7008 PredInst2Recipe[I] = Recipe; 7009 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7010 VPBlockUtils::insertBlockAfter(Region, VPBB); 7011 auto *RegSucc = new VPBasicBlock(); 7012 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7013 return RegSucc; 7014 } 7015 7016 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7017 VPRecipeBase *PredRecipe, 7018 VPlanPtr &Plan) { 7019 // Instructions marked for predication are replicated and placed under an 7020 // if-then construct to prevent side-effects. 7021 7022 // Generate recipes to compute the block mask for this region. 
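  // The resulting region is a triangle built below:
  //   pred.<opcode>.entry    - branch-on-mask recipe
  //   pred.<opcode>.if       - the replicated (predicated) instruction
  //   pred.<opcode>.continue - optional phi merging the predicated value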
7023 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 7024 7025 // Build the triangular if-then region. 7026 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 7027 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 7028 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 7029 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 7030 auto *PHIRecipe = 7031 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 7032 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 7033 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 7034 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 7035 7036 // Note: first set Entry as region entry and then connect successors starting 7037 // from it in order, to propagate the "parent" of each VPBasicBlock. 7038 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 7039 VPBlockUtils::connectBlocks(Pred, Exit); 7040 7041 return Region; 7042 } 7043 7044 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, 7045 VPlanPtr &Plan, VPBasicBlock *VPBB) { 7046 VPRecipeBase *Recipe = nullptr; 7047 7048 // First, check for specific widening recipes that deal with memory 7049 // operations, inductions and Phi nodes. 7050 if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) || 7051 (Recipe = tryToOptimizeInduction(Instr, Range)) || 7052 (Recipe = tryToBlend(Instr, Plan)) || 7053 (isa<PHINode>(Instr) && 7054 (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) { 7055 setRecipe(Instr, Recipe); 7056 VPBB->appendRecipe(Recipe); 7057 return true; 7058 } 7059 7060 // Handle GEP widening. 7061 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) { 7062 auto Scalarize = [&](unsigned VF) { 7063 return CM.isScalarWithPredication(Instr, VF) || 7064 CM.isScalarAfterVectorization(Instr, VF) || 7065 CM.isProfitableToScalarize(Instr, VF); 7066 }; 7067 if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range)) 7068 return false; 7069 VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop); 7070 setRecipe(Instr, Recipe); 7071 VPBB->appendRecipe(Recipe); 7072 return true; 7073 } 7074 7075 // Check if Instr is to be widened by a general VPWidenRecipe, after 7076 // having first checked for specific widening recipes. 7077 if (tryToWiden(Instr, VPBB, Range)) 7078 return true; 7079 7080 return false; 7081 } 7082 7083 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 7084 unsigned MaxVF) { 7085 assert(OrigLoop->empty() && "Inner loop expected."); 7086 7087 // Collect conditions feeding internal conditional branches; they need to be 7088 // represented in VPlan for it to model masking. 7089 SmallPtrSet<Value *, 1> NeedDef; 7090 7091 auto *Latch = OrigLoop->getLoopLatch(); 7092 for (BasicBlock *BB : OrigLoop->blocks()) { 7093 if (BB == Latch) 7094 continue; 7095 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); 7096 if (Branch && Branch->isConditional()) 7097 NeedDef.insert(Branch->getCondition()); 7098 } 7099 7100 // If the tail is to be folded by masking, the primary induction variable 7101 // needs to be represented in VPlan for it to model early-exit masking. 7102 // Also, both the Phi and the live-out instruction of each reduction are 7103 // required in order to introduce a select between them in VPlan. 
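  //
  // For example (names are illustrative): for a sum reduction with header phi
  // %red.phi and loop-exit (latch) update %red.next, the select emitted at
  // the end of buildVPlanWithVPRecipes is of the form
  //   select(header block-in mask, %red.next, %red.phi)
  // so both %red.phi and %red.next need VPValues in the plan, as recorded
  // below.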
7104 if (CM.foldTailByMasking()) { 7105 NeedDef.insert(Legal->getPrimaryInduction()); 7106 for (auto &Reduction : *Legal->getReductionVars()) { 7107 NeedDef.insert(Reduction.first); 7108 NeedDef.insert(Reduction.second.getLoopExitInstr()); 7109 } 7110 } 7111 7112 // Collect instructions from the original loop that will become trivially dead 7113 // in the vectorized loop. We don't need to vectorize these instructions. For 7114 // example, original induction update instructions can become dead because we 7115 // separately emit induction "steps" when generating code for the new loop. 7116 // Similarly, we create a new latch condition when setting up the structure 7117 // of the new loop, so the old one can become dead. 7118 SmallPtrSet<Instruction *, 4> DeadInstructions; 7119 collectTriviallyDeadInstructions(DeadInstructions); 7120 7121 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 7122 VFRange SubRange = {VF, MaxVF + 1}; 7123 VPlans.push_back( 7124 buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions)); 7125 VF = SubRange.End; 7126 } 7127 } 7128 7129 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 7130 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, 7131 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7132 7133 // Hold a mapping from predicated instructions to their recipes, in order to 7134 // fix their AlsoPack behavior if a user is determined to replicate and use a 7135 // scalar instead of vector value. 7136 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; 7137 7138 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 7139 7140 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 7141 7142 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); 7143 7144 // --------------------------------------------------------------------------- 7145 // Pre-construction: record ingredients whose recipes we'll need to further 7146 // process after constructing the initial VPlan. 7147 // --------------------------------------------------------------------------- 7148 7149 // Mark instructions we'll need to sink later and their targets as 7150 // ingredients whose recipe we'll need to record. 7151 for (auto &Entry : SinkAfter) { 7152 RecipeBuilder.recordRecipeOf(Entry.first); 7153 RecipeBuilder.recordRecipeOf(Entry.second); 7154 } 7155 7156 // For each interleave group which is relevant for this (possibly trimmed) 7157 // Range, add it to the set of groups to be later applied to the VPlan and add 7158 // placeholders for its members' Recipes which we'll be replacing with a 7159 // single VPInterleaveRecipe. 7160 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 7161 auto applyIG = [IG, this](unsigned VF) -> bool { 7162 return (VF >= 2 && // Query is illegal for VF == 1 7163 CM.getWideningDecision(IG->getInsertPos(), VF) == 7164 LoopVectorizationCostModel::CM_Interleave); 7165 }; 7166 if (!getDecisionAndClampRange(applyIG, Range)) 7167 continue; 7168 InterleaveGroups.insert(IG); 7169 for (unsigned i = 0; i < IG->getFactor(); i++) 7170 if (Instruction *Member = IG->getMember(i)) 7171 RecipeBuilder.recordRecipeOf(Member); 7172 }; 7173 7174 // --------------------------------------------------------------------------- 7175 // Build initial VPlan: Scan the body of the loop in a topological order to 7176 // visit each basic block after having visited its predecessor basic blocks. 
7177 // --------------------------------------------------------------------------- 7178 7179 // Add assume instructions we need to drop to DeadInstructions, to prevent 7180 // them from being added to the VPlan. 7181 // TODO: We only need to drop assumes in blocks that get flattend. If the 7182 // control flow is preserved, we should keep them. 7183 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 7184 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 7185 7186 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 7187 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 7188 auto Plan = std::make_unique<VPlan>(VPBB); 7189 7190 // Represent values that will have defs inside VPlan. 7191 for (Value *V : NeedDef) 7192 Plan->addVPValue(V); 7193 7194 // Scan the body of the loop in a topological order to visit each basic block 7195 // after having visited its predecessor basic blocks. 7196 LoopBlocksDFS DFS(OrigLoop); 7197 DFS.perform(LI); 7198 7199 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 7200 // Relevant instructions from basic block BB will be grouped into VPRecipe 7201 // ingredients and fill a new VPBasicBlock. 7202 unsigned VPBBsForBB = 0; 7203 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 7204 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 7205 VPBB = FirstVPBBForBB; 7206 Builder.setInsertPoint(VPBB); 7207 7208 // Introduce each ingredient into VPlan. 7209 for (Instruction &I : BB->instructionsWithoutDebug()) { 7210 Instruction *Instr = &I; 7211 7212 // First filter out irrelevant instructions, to ensure no recipes are 7213 // built for them. 7214 if (isa<BranchInst>(Instr) || 7215 DeadInstructions.find(Instr) != DeadInstructions.end()) 7216 continue; 7217 7218 if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB)) 7219 continue; 7220 7221 // Otherwise, if all widening options failed, Instruction is to be 7222 // replicated. This may create a successor for VPBB. 7223 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7224 Instr, Range, VPBB, PredInst2Recipe, Plan); 7225 if (NextVPBB != VPBB) { 7226 VPBB = NextVPBB; 7227 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 7228 : ""); 7229 } 7230 } 7231 } 7232 7233 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7234 // may also be empty, such as the last one VPBB, reflecting original 7235 // basic-blocks with no recipes. 7236 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7237 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7238 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7239 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7240 delete PreEntry; 7241 7242 // --------------------------------------------------------------------------- 7243 // Transform initial VPlan: Apply previously taken decisions, in order, to 7244 // bring the VPlan to its final state. 7245 // --------------------------------------------------------------------------- 7246 7247 // Apply Sink-After legal constraints. 7248 for (auto &Entry : SinkAfter) { 7249 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7250 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7251 Sink->moveAfter(Target); 7252 } 7253 7254 // Interleave memory: for each Interleave Group we marked earlier as relevant 7255 // for this VPlan, replace the Recipes widening its memory instructions with a 7256 // single VPInterleaveRecipe at its insertion point. 
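  //
  // For example (illustrative): given a factor-2 group over the loads
  //   %even = load a[2 * i]
  //   %odd  = load a[2 * i + 1]
  // both member recipes are erased and one VPInterleaveRecipe is placed at
  // the group's insert position; when executed it emits a single wide load
  // covering both streams plus shuffles extracting the even and odd lanes.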
7257 for (auto IG : InterleaveGroups) { 7258 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7259 RecipeBuilder.getRecipe(IG->getInsertPos())); 7260 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) 7261 ->insertBefore(Recipe); 7262 7263 for (unsigned i = 0; i < IG->getFactor(); ++i) 7264 if (Instruction *Member = IG->getMember(i)) { 7265 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7266 } 7267 } 7268 7269 // Finally, if tail is folded by masking, introduce selects between the phi 7270 // and the live-out instruction of each reduction, at the end of the latch. 7271 if (CM.foldTailByMasking()) { 7272 Builder.setInsertPoint(VPBB); 7273 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 7274 for (auto &Reduction : *Legal->getReductionVars()) { 7275 VPValue *Phi = Plan->getVPValue(Reduction.first); 7276 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); 7277 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 7278 } 7279 } 7280 7281 std::string PlanName; 7282 raw_string_ostream RSO(PlanName); 7283 unsigned VF = Range.Start; 7284 Plan->addVF(VF); 7285 RSO << "Initial VPlan for VF={" << VF; 7286 for (VF *= 2; VF < Range.End; VF *= 2) { 7287 Plan->addVF(VF); 7288 RSO << "," << VF; 7289 } 7290 RSO << "},UF>=1"; 7291 RSO.flush(); 7292 Plan->setName(PlanName); 7293 7294 return Plan; 7295 } 7296 7297 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 7298 // Outer loop handling: They may require CFG and instruction level 7299 // transformations before even evaluating whether vectorization is profitable. 7300 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7301 // the vectorization pipeline. 7302 assert(!OrigLoop->empty()); 7303 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7304 7305 // Create new empty VPlan 7306 auto Plan = std::make_unique<VPlan>(); 7307 7308 // Build hierarchical CFG 7309 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 7310 HCFGBuilder.buildHierarchicalCFG(); 7311 7312 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) 7313 Plan->addVF(VF); 7314 7315 if (EnableVPlanPredication) { 7316 VPlanPredicator VPP(*Plan); 7317 VPP.predicate(); 7318 7319 // Avoid running transformation to recipes until masked code generation in 7320 // VPlan-native path is in place. 
7321 return Plan; 7322 } 7323 7324 SmallPtrSet<Instruction *, 1> DeadInstructions; 7325 VPlanTransforms::VPInstructionsToVPRecipes( 7326 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 7327 return Plan; 7328 } 7329 7330 Value* LoopVectorizationPlanner::VPCallbackILV:: 7331 getOrCreateVectorValues(Value *V, unsigned Part) { 7332 return ILV.getOrCreateVectorValue(V, Part); 7333 } 7334 7335 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 7336 Value *V, const VPIteration &Instance) { 7337 return ILV.getOrCreateScalarValue(V, Instance); 7338 } 7339 7340 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { 7341 O << " +\n" 7342 << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7343 IG->getInsertPos()->printAsOperand(O, false); 7344 O << ", "; 7345 getAddr()->printAsOperand(O); 7346 VPValue *Mask = getMask(); 7347 if (Mask) { 7348 O << ", "; 7349 Mask->printAsOperand(O); 7350 } 7351 O << "\\l\""; 7352 for (unsigned i = 0; i < IG->getFactor(); ++i) 7353 if (Instruction *I = IG->getMember(i)) 7354 O << " +\n" 7355 << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\""; 7356 } 7357 7358 void VPWidenRecipe::execute(VPTransformState &State) { 7359 for (auto &Instr : make_range(Begin, End)) 7360 State.ILV->widenInstruction(Instr); 7361 } 7362 7363 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7364 State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, 7365 IsIndexLoopInvariant); 7366 } 7367 7368 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7369 assert(!State.Instance && "Int or FP induction being replicated."); 7370 State.ILV->widenIntOrFpInduction(IV, Trunc); 7371 } 7372 7373 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7374 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7375 } 7376 7377 void VPBlendRecipe::execute(VPTransformState &State) { 7378 State.ILV->setDebugLocFromInst(State.Builder, Phi); 7379 // We know that all PHIs in non-header blocks are converted into 7380 // selects, so we don't have to worry about the insertion order and we 7381 // can just use the builder. 7382 // At this point we generate the predication tree. There may be 7383 // duplications since this is a simple recursive scan, but future 7384 // optimizations will clean it up. 7385 7386 unsigned NumIncoming = Phi->getNumIncomingValues(); 7387 7388 assert((User || NumIncoming == 1) && 7389 "Multiple predecessors with predecessors having a full mask"); 7390 // Generate a sequence of selects of the form: 7391 // SELECT(Mask3, In3, 7392 // SELECT(Mask2, In2, 7393 // ( ...))) 7394 InnerLoopVectorizer::VectorParts Entry(State.UF); 7395 for (unsigned In = 0; In < NumIncoming; ++In) { 7396 for (unsigned Part = 0; Part < State.UF; ++Part) { 7397 // We might have single edge PHIs (blocks) - use an identity 7398 // 'select' for the first PHI operand. 7399 Value *In0 = 7400 State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part); 7401 if (In == 0) 7402 Entry[Part] = In0; // Initialize with the first incoming value. 7403 else { 7404 // Select between the current value and the previous incoming edge 7405 // based on the incoming mask. 
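        // E.g. for VF == 4 (illustrative IR, names are made up):
        //   %predphi = select <4 x i1> %edge.mask, <4 x i32> %in.current,
        //                     <4 x i32> %select.prev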
        Value *Cond = State.get(User->getOperand(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
}

void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(),
                                      getMask());
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF > 1) {
      // If we're constructing lane 0, initialize to start from undef.
      if (State.Instance->Lane == 0) {
        Value *Undef =
            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF;
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  if (!User) // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();
  else {
    VPValue *BlockInMask = User->getOperand(0);
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  }

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
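  //
  // Illustrative shape of the emitted IR (names are hypothetical; the actual
  // block names come from the enclosing replicate region):
  //   %cond = extractelement <4 x i1> %block.in.mask, i32 <Lane>
  //   br i1 %cond, label <unset>, label <unset>
  // Both destinations are filled in later, once the successor basic blocks
  // exist.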
7466 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 7467 assert(isa<UnreachableInst>(CurrentTerminator) && 7468 "Expected to replace unreachable terminator with conditional branch."); 7469 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 7470 CondBr->setSuccessor(0, nullptr); 7471 ReplaceInstWithInst(CurrentTerminator, CondBr); 7472 } 7473 7474 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 7475 assert(State.Instance && "Predicated instruction PHI works per instance."); 7476 Instruction *ScalarPredInst = cast<Instruction>( 7477 State.ValueMap.getScalarValue(PredInst, *State.Instance)); 7478 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 7479 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 7480 assert(PredicatingBB && "Predicated block has no single predecessor."); 7481 7482 // By current pack/unpack logic we need to generate only a single phi node: if 7483 // a vector value for the predicated instruction exists at this point it means 7484 // the instruction has vector users only, and a phi for the vector value is 7485 // needed. In this case the recipe of the predicated instruction is marked to 7486 // also do that packing, thereby "hoisting" the insert-element sequence. 7487 // Otherwise, a phi node for the scalar value is needed. 7488 unsigned Part = State.Instance->Part; 7489 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 7490 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 7491 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 7492 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 7493 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 7494 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 7495 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 7496 } else { 7497 Type *PredInstType = PredInst->getType(); 7498 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 7499 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 7500 Phi->addIncoming(ScalarPredInst, PredicatedBB); 7501 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 7502 } 7503 } 7504 7505 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 7506 State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask()); 7507 } 7508 7509 // Determine how to lower the scalar epilogue, which depends on 1) optimising 7510 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 7511 // predication, and 4) a TTI hook that analyses whether the loop is suitable 7512 // for predication. 7513 static ScalarEpilogueLowering getScalarEpilogueLowering( 7514 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 7515 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 7516 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 7517 LoopVectorizationLegality &LVL) { 7518 bool OptSize = 7519 F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 7520 PGSOQueryType::IRPass); 7521 // 1) OptSize takes precedence over all other options, i.e. if this is set, 7522 // don't look at hints or options, and don't request a scalar epilogue. 
7523 if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) 7524 return CM_ScalarEpilogueNotAllowedOptSize; 7525 7526 bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && 7527 !PreferPredicateOverEpilog; 7528 7529 // 2) Next, if disabling predication is requested on the command line, honour 7530 // this and request a scalar epilogue. Also do this if we don't have a 7531 // primary induction variable, which is required for predication. 7532 if (PredicateOptDisabled || !LVL.getPrimaryInduction()) 7533 return CM_ScalarEpilogueAllowed; 7534 7535 // 3) and 4) look if enabling predication is requested on the command line, 7536 // with a loop hint, or if the TTI hook indicates this is profitable, request 7537 // predication . 7538 if (PreferPredicateOverEpilog || 7539 Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || 7540 (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 7541 LVL.getLAI()) && 7542 Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) 7543 return CM_ScalarEpilogueNotNeededUsePredicate; 7544 7545 return CM_ScalarEpilogueAllowed; 7546 } 7547 7548 // Process the loop in the VPlan-native vectorization path. This path builds 7549 // VPlan upfront in the vectorization pipeline, which allows to apply 7550 // VPlan-to-VPlan transformations from the very beginning without modifying the 7551 // input LLVM IR. 7552 static bool processLoopInVPlanNativePath( 7553 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 7554 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 7555 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 7556 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 7557 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 7558 7559 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 7560 Function *F = L->getHeader()->getParent(); 7561 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 7562 7563 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7564 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 7565 7566 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 7567 &Hints, IAI); 7568 // Use the planner for outer loop vectorization. 7569 // TODO: CM is not used at this point inside the planner. Turn CM into an 7570 // optional argument if we don't need it in the future. 7571 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI); 7572 7573 // Get user vectorization factor. 7574 const unsigned UserVF = Hints.getWidth(); 7575 7576 // Plan how to best vectorize, return the best VF and its cost. 7577 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 7578 7579 // If we are stress testing VPlan builds, do not attempt to generate vector 7580 // code. Masked vector code generation support will follow soon. 7581 // Also, do not attempt to vectorize if no vector code will be produced. 7582 if (VPlanBuildStressTest || EnableVPlanPredication || 7583 VectorizationFactor::Disabled() == VF) 7584 return false; 7585 7586 LVP.setBestPlan(VF.Width, 1); 7587 7588 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 7589 &CM); 7590 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 7591 << L->getHeader()->getParent()->getName() << "\"\n"); 7592 LVP.executePlan(LB, DT); 7593 7594 // Mark the loop as already vectorized to avoid vectorizing again. 
7595 Hints.setAlreadyVectorized(); 7596 7597 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); 7598 return true; 7599 } 7600 7601 bool LoopVectorizePass::processLoop(Loop *L) { 7602 assert((EnableVPlanNativePath || L->empty()) && 7603 "VPlan-native path is not enabled. Only process inner loops."); 7604 7605 #ifndef NDEBUG 7606 const std::string DebugLocStr = getDebugLocString(L); 7607 #endif /* NDEBUG */ 7608 7609 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 7610 << L->getHeader()->getParent()->getName() << "\" from " 7611 << DebugLocStr << "\n"); 7612 7613 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 7614 7615 LLVM_DEBUG( 7616 dbgs() << "LV: Loop hints:" 7617 << " force=" 7618 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 7619 ? "disabled" 7620 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 7621 ? "enabled" 7622 : "?")) 7623 << " width=" << Hints.getWidth() 7624 << " unroll=" << Hints.getInterleave() << "\n"); 7625 7626 // Function containing loop 7627 Function *F = L->getHeader()->getParent(); 7628 7629 // Looking at the diagnostic output is the only way to determine if a loop 7630 // was vectorized (other than looking at the IR or machine code), so it 7631 // is important to generate an optimization remark for each loop. Most of 7632 // these messages are generated as OptimizationRemarkAnalysis. Remarks 7633 // generated as OptimizationRemark and OptimizationRemarkMissed are 7634 // less verbose reporting vectorized loops and unvectorized loops that may 7635 // benefit from vectorization, respectively. 7636 7637 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 7638 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 7639 return false; 7640 } 7641 7642 PredicatedScalarEvolution PSE(*SE, *L); 7643 7644 // Check if it is legal to vectorize the loop. 7645 LoopVectorizationRequirements Requirements(*ORE); 7646 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 7647 &Requirements, &Hints, DB, AC); 7648 if (!LVL.canVectorize(EnableVPlanNativePath)) { 7649 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 7650 Hints.emitRemarkWithHints(); 7651 return false; 7652 } 7653 7654 // Check the function attributes and profiles to find out if this function 7655 // should be optimized for size. 7656 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7657 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 7658 7659 // Entrance to the VPlan-native vectorization path. Outer loops are processed 7660 // here. They may require CFG and instruction level transformations before 7661 // even evaluating whether vectorization is profitable. Since we cannot modify 7662 // the incoming IR, we need to build VPlan upfront in the vectorization 7663 // pipeline. 7664 if (!L->empty()) 7665 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 7666 ORE, BFI, PSI, Hints); 7667 7668 assert(L->empty() && "Inner loop expected."); 7669 7670 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 7671 // count by optimizing for size, to minimize overheads. 7672 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 7673 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 7674 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. 
" 7675 << "This loop is worth vectorizing only if no scalar " 7676 << "iteration overheads are incurred."); 7677 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 7678 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 7679 else { 7680 LLVM_DEBUG(dbgs() << "\n"); 7681 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 7682 } 7683 } 7684 7685 // Check the function attributes to see if implicit floats are allowed. 7686 // FIXME: This check doesn't seem possibly correct -- what if the loop is 7687 // an integer loop and the vector instructions selected are purely integer 7688 // vector instructions? 7689 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 7690 reportVectorizationFailure( 7691 "Can't vectorize when the NoImplicitFloat attribute is used", 7692 "loop not vectorized due to NoImplicitFloat attribute", 7693 "NoImplicitFloat", ORE, L); 7694 Hints.emitRemarkWithHints(); 7695 return false; 7696 } 7697 7698 // Check if the target supports potentially unsafe FP vectorization. 7699 // FIXME: Add a check for the type of safety issue (denormal, signaling) 7700 // for the target we're vectorizing for, to make sure none of the 7701 // additional fp-math flags can help. 7702 if (Hints.isPotentiallyUnsafe() && 7703 TTI->isFPVectorizationPotentiallyUnsafe()) { 7704 reportVectorizationFailure( 7705 "Potentially unsafe FP op prevents vectorization", 7706 "loop not vectorized due to unsafe FP support.", 7707 "UnsafeFP", ORE, L); 7708 Hints.emitRemarkWithHints(); 7709 return false; 7710 } 7711 7712 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 7713 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 7714 7715 // If an override option has been passed in for interleaved accesses, use it. 7716 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 7717 UseInterleaved = EnableInterleavedMemAccesses; 7718 7719 // Analyze interleaved memory accesses. 7720 if (UseInterleaved) { 7721 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 7722 } 7723 7724 // Use the cost model. 7725 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 7726 F, &Hints, IAI); 7727 CM.collectValuesToIgnore(); 7728 7729 // Use the planner for vectorization. 7730 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI); 7731 7732 // Get user vectorization factor. 7733 unsigned UserVF = Hints.getWidth(); 7734 7735 // Plan how to best vectorize, return the best VF and its cost. 7736 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF); 7737 7738 VectorizationFactor VF = VectorizationFactor::Disabled(); 7739 unsigned IC = 1; 7740 unsigned UserIC = Hints.getInterleave(); 7741 7742 if (MaybeVF) { 7743 VF = *MaybeVF; 7744 // Select the interleave count. 7745 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 7746 } 7747 7748 // Identify the diagnostic messages that should be produced. 
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width == 1) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user that interleaving is beneficial but is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
7805 ORE->emit([&]() { 7806 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 7807 L->getStartLoc(), L->getHeader()) 7808 << VecDiagMsg.second; 7809 }); 7810 ORE->emit([&]() { 7811 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 7812 L->getStartLoc(), L->getHeader()) 7813 << IntDiagMsg.second; 7814 }); 7815 return false; 7816 } else if (!VectorizeLoop && InterleaveLoop) { 7817 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7818 ORE->emit([&]() { 7819 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 7820 L->getStartLoc(), L->getHeader()) 7821 << VecDiagMsg.second; 7822 }); 7823 } else if (VectorizeLoop && !InterleaveLoop) { 7824 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7825 << ") in " << DebugLocStr << '\n'); 7826 ORE->emit([&]() { 7827 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 7828 L->getStartLoc(), L->getHeader()) 7829 << IntDiagMsg.second; 7830 }); 7831 } else if (VectorizeLoop && InterleaveLoop) { 7832 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7833 << ") in " << DebugLocStr << '\n'); 7834 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7835 } 7836 7837 LVP.setBestPlan(VF.Width, IC); 7838 7839 using namespace ore; 7840 bool DisableRuntimeUnroll = false; 7841 MDNode *OrigLoopID = L->getLoopID(); 7842 7843 if (!VectorizeLoop) { 7844 assert(IC > 1 && "interleave count should not be 1 or 0"); 7845 // If we decided that it is not legal to vectorize the loop, then 7846 // interleave it. 7847 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 7848 &CM); 7849 LVP.executePlan(Unroller, DT); 7850 7851 ORE->emit([&]() { 7852 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 7853 L->getHeader()) 7854 << "interleaved loop (interleaved count: " 7855 << NV("InterleaveCount", IC) << ")"; 7856 }); 7857 } else { 7858 // If we decided that it is *legal* to vectorize the loop, then do it. 7859 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 7860 &LVL, &CM); 7861 LVP.executePlan(LB, DT); 7862 ++LoopsVectorized; 7863 7864 // Add metadata to disable runtime unrolling a scalar loop when there are 7865 // no runtime checks about strides and memory. A scalar loop that is 7866 // rarely used is not worth unrolling. 7867 if (!LB.areSafetyChecksAdded()) 7868 DisableRuntimeUnroll = true; 7869 7870 // Report the vectorization decision. 7871 ORE->emit([&]() { 7872 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 7873 L->getHeader()) 7874 << "vectorized loop (vectorization width: " 7875 << NV("VectorizationFactor", VF.Width) 7876 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 7877 }); 7878 } 7879 7880 Optional<MDNode *> RemainderLoopID = 7881 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7882 LLVMLoopVectorizeFollowupEpilogue}); 7883 if (RemainderLoopID.hasValue()) { 7884 L->setLoopID(RemainderLoopID.getValue()); 7885 } else { 7886 if (DisableRuntimeUnroll) 7887 AddRuntimeUnrollDisableMetaData(L); 7888 7889 // Mark the loop as already vectorized to avoid vectorizing again. 
7890 Hints.setAlreadyVectorized(); 7891 } 7892 7893 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); 7894 return true; 7895 } 7896 7897 bool LoopVectorizePass::runImpl( 7898 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 7899 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 7900 DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, 7901 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 7902 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 7903 SE = &SE_; 7904 LI = &LI_; 7905 TTI = &TTI_; 7906 DT = &DT_; 7907 BFI = &BFI_; 7908 TLI = TLI_; 7909 AA = &AA_; 7910 AC = &AC_; 7911 GetLAA = &GetLAA_; 7912 DB = &DB_; 7913 ORE = &ORE_; 7914 PSI = PSI_; 7915 7916 // Don't attempt if 7917 // 1. the target claims to have no vector registers, and 7918 // 2. interleaving won't help ILP. 7919 // 7920 // The second condition is necessary because, even if the target has no 7921 // vector registers, loop vectorization may still enable scalar 7922 // interleaving. 7923 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 7924 TTI->getMaxInterleaveFactor(1) < 2) 7925 return false; 7926 7927 bool Changed = false; 7928 7929 // The vectorizer requires loops to be in simplified form. 7930 // Since simplification may add new inner loops, it has to run before the 7931 // legality and profitability checks. This means running the loop vectorizer 7932 // will simplify all loops, regardless of whether anything end up being 7933 // vectorized. 7934 for (auto &L : *LI) 7935 Changed |= 7936 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 7937 7938 // Build up a worklist of inner-loops to vectorize. This is necessary as 7939 // the act of vectorizing or partially unrolling a loop creates new loops 7940 // and can invalidate iterators across the loops. 7941 SmallVector<Loop *, 8> Worklist; 7942 7943 for (Loop *L : *LI) 7944 collectSupportedLoops(*L, LI, ORE, Worklist); 7945 7946 LoopsAnalyzed += Worklist.size(); 7947 7948 // Now walk the identified inner loops. 7949 while (!Worklist.empty()) { 7950 Loop *L = Worklist.pop_back_val(); 7951 7952 // For the inner loops we actually process, form LCSSA to simplify the 7953 // transform. 7954 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 7955 7956 Changed |= processLoop(L); 7957 } 7958 7959 // Process each loop nest in the function. 7960 return Changed; 7961 } 7962 7963 PreservedAnalyses LoopVectorizePass::run(Function &F, 7964 FunctionAnalysisManager &AM) { 7965 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 7966 auto &LI = AM.getResult<LoopAnalysis>(F); 7967 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 7968 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 7969 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 7970 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 7971 auto &AA = AM.getResult<AAManager>(F); 7972 auto &AC = AM.getResult<AssumptionAnalysis>(F); 7973 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 7974 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 7975 MemorySSA *MSSA = EnableMSSALoopDependency 7976 ? 
&AM.getResult<MemorySSAAnalysis>(F).getMSSA() 7977 : nullptr; 7978 7979 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 7980 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 7981 [&](Loop &L) -> const LoopAccessInfo & { 7982 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; 7983 return LAM.getResult<LoopAccessAnalysis>(L, AR); 7984 }; 7985 const ModuleAnalysisManager &MAM = 7986 AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager(); 7987 ProfileSummaryInfo *PSI = 7988 MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 7989 bool Changed = 7990 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 7991 if (!Changed) 7992 return PreservedAnalyses::all(); 7993 PreservedAnalyses PA; 7994 7995 // We currently do not preserve loopinfo/dominator analyses with outer loop 7996 // vectorization. Until this is addressed, mark these analyses as preserved 7997 // only for non-VPlan-native path. 7998 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 7999 if (!EnableVPlanNativePath) { 8000 PA.preserve<LoopAnalysis>(); 8001 PA.preserve<DominatorTreeAnalysis>(); 8002 } 8003 PA.preserve<BasicAA>(); 8004 PA.preserve<GlobalsAA>(); 8005 return PA; 8006 } 8007