//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
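//
// As an illustrative sketch only (not taken from the references above), the
// widening transformation described earlier conceptually turns a scalar loop
// such as
//
//   for (i = 0; i < n; i++)
//     a[i] = b[i] + c[i];
//
// into a loop whose induction variable advances by the chosen vectorization
// factor VF (written here in pseudo-code for VF = 4), followed by a scalar
// epilogue for the remaining iterations:
//
//   for (i = 0; i + 4 <= n; i += 4)
//     a[i..i+3] = b[i..i+3] + c[i..i+3];   // one 'wide' SIMD iteration
//   for (; i < n; i++)                     // scalar remainder (epilogue)
//     a[i] = b[i] + c[i];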
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired; predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses "
             "in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
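///
/// A rough usage sketch (simplified and illustrative only; it assumes the
/// surrounding driver code and omits the VPlan details - in-tree, the
/// LoopVectorizationPlanner drives the actual code generation):
///
/// \code
///   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF, UF,
///                          &LVL, &CM);
///   LB.createVectorizedLoopSkeleton(); // new loop structure + bypass checks
///   // ... widen the instructions of the original loop body ...
///   LB.fixVectorizedLoop();            // fix header phis, live-outs, etc.
/// \endcode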
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, bool InvariantCond);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
                bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between
  /// \p MinLane and \p MaxLane, times each part between \p MinPart and
  /// \p MaxPart, inclusive.
  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
                            bool IfPredicateInstr);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate
  /// a vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                VPTransformState &State, VPValue *Addr,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr, VPValue *StoredValue,
                                  VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs();

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing, we only handled real values
  /// that were defined inside the loop, and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...;
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at \p StartIdx.
  /// \p Opcode is relevant for FP induction variables.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In
  /// the latter case \p EntryVal is a TruncInst and we must not record anything
  /// for that IV, but it's error-prone to expect callers of this routine to
  /// care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AliasAnalysis *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B,
                                              const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
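///
/// As a simplified sketch of how the per-VF costs are compared (illustrative
/// only; it ignores the boolean part of VectorizationCostTy and all the
/// legality and tie-breaking details handled elsewhere in this file):
///
/// \code
///   float Cost = expectedCost(1).first;  // cost of one scalar iteration
///   unsigned Width = 1;
///   for (unsigned VF = 2; VF <= MaxVF; VF *= 2) {
///     // Costs are not normalized by VF, so compare per scalar lane.
///     float VectorCost = expectedCost(VF).first / (float)VF;
///     if (VectorCost < Cost) {
///       Cost = VectorCost;
///       Width = VF;
///     }
///   }
///   // 'Width' is the factor selectVectorizationFactor() would favor.
/// \endcode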
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF();

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usage of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    MaybeAlign Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the tail of the
  /// loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized - i.e. either a vector version isn't available, or it is too
  /// expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

  /// The cost computation for interleaving group of memory instructions.
1349 unsigned getInterleaveGroupCost(Instruction *I, unsigned VF); 1350 1351 /// The cost computation for a Gather/Scatter instruction. 1352 unsigned getGatherScatterCost(Instruction *I, unsigned VF); 1353 1354 /// The cost computation for widening instruction \p I with consecutive 1355 /// memory access. 1356 unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF); 1357 1358 /// The cost calculation for Load/Store instruction \p I with a uniform pointer - 1359 /// Load: scalar load + broadcast. 1360 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1361 /// element) 1362 unsigned getUniformMemOpCost(Instruction *I, unsigned VF); 1363 1364 /// Estimate the overhead of scalarizing an instruction. This is a 1365 /// convenience wrapper for the type-based getScalarizationOverhead API. 1366 unsigned getScalarizationOverhead(Instruction *I, unsigned VF); 1367 1368 /// Returns whether the instruction is a load or store and will be emitted 1369 /// as a vector operation. 1370 bool isConsecutiveLoadOrStore(Instruction *I); 1371 1372 /// Returns true if an artificially high cost for emulated masked memrefs 1373 /// should be used. 1374 bool useEmulatedMaskMemRefHack(Instruction *I); 1375 1376 /// Map of scalar integer values to the smallest bitwidth they can be legally 1377 /// represented as. The vector equivalents of these values should be truncated 1378 /// to this type. 1379 MapVector<Instruction *, uint64_t> MinBWs; 1380 1381 /// A type representing the costs for instructions if they were to be 1382 /// scalarized rather than vectorized. The entries are Instruction-Cost 1383 /// pairs. 1384 using ScalarCostsTy = DenseMap<Instruction *, unsigned>; 1385 1386 /// A set containing all BasicBlocks that are known to be present after 1387 /// vectorization as predicated blocks. 1388 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; 1389 1390 /// Records whether it is allowed to have the original scalar loop execute at 1391 /// least once. This may be needed as a fallback loop in case runtime 1392 /// aliasing/dependence checks fail, or to handle the tail/remainder 1393 /// iterations when the trip count is unknown or doesn't divide by the VF, 1394 /// or as a peel-loop to handle gaps in interleave-groups. 1395 /// Under optsize and when the trip count is very small we don't allow any 1396 /// iterations to execute in the scalar loop. 1397 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1398 1399 /// All blocks of the loop are to be masked to fold the tail of scalar iterations. 1400 bool FoldTailByMasking = false; 1401 1402 /// A map holding scalar costs for different vectorization factors. The 1403 /// presence of a cost for an instruction in the mapping indicates that the 1404 /// instruction will be scalarized when vectorizing with the associated 1405 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1406 DenseMap<unsigned, ScalarCostsTy> InstsToScalarize; 1407 1408 /// Holds the instructions known to be uniform after vectorization. 1409 /// The data is collected per VF. 1410 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms; 1411 1412 /// Holds the instructions known to be scalar after vectorization. 1413 /// The data is collected per VF. 1414 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars; 1415 1416 /// Holds the instructions (address computations) that are forced to be 1417 /// scalarized.
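/// For example, when a load or store is itself decided to be scalarized, the
/// GEP computing its address is typically recorded here so that no vector
/// address computation is emitted for it (illustrative note; see
/// setCostBasedWideningDecision for the exact conditions).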
1418 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1419 1420 /// Returns the expected difference in cost from scalarizing the expression 1421 /// feeding a predicated instruction \p PredInst. The instructions to 1422 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1423 /// non-negative return value implies the expression will be scalarized. 1424 /// Currently, only single-use chains are considered for scalarization. 1425 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1426 unsigned VF); 1427 1428 /// Collect the instructions that are uniform after vectorization. An 1429 /// instruction is uniform if we represent it with a single scalar value in 1430 /// the vectorized loop corresponding to each vector iteration. Examples of 1431 /// uniform instructions include pointer operands of consecutive or 1432 /// interleaved memory accesses. Note that although uniformity implies an 1433 /// instruction will be scalar, the reverse is not true. In general, a 1434 /// scalarized instruction will be represented by VF scalar values in the 1435 /// vectorized loop, each corresponding to an iteration of the original 1436 /// scalar loop. 1437 void collectLoopUniforms(unsigned VF); 1438 1439 /// Collect the instructions that are scalar after vectorization. An 1440 /// instruction is scalar if it is known to be uniform or will be scalarized 1441 /// during vectorization. Non-uniform scalarized instructions will be 1442 /// represented by VF values in the vectorized loop, each corresponding to an 1443 /// iteration of the original scalar loop. 1444 void collectLoopScalars(unsigned VF); 1445 1446 /// Keeps cost model vectorization decision and cost for instructions. 1447 /// Right now it is used for memory instructions only. 1448 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, 1449 std::pair<InstWidening, unsigned>>; 1450 1451 DecisionList WideningDecisions; 1452 1453 /// Returns true if \p V is expected to be vectorized and it needs to be 1454 /// extracted. 1455 bool needsExtract(Value *V, unsigned VF) const { 1456 Instruction *I = dyn_cast<Instruction>(V); 1457 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) 1458 return false; 1459 1460 // Assume we can vectorize V (and hence we need extraction) if the 1461 // scalars are not computed yet. This can happen, because it is called 1462 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1463 // the scalars are collected. That should be a safe assumption in most 1464 // cases, because we check if the operands have vectorizable types 1465 // beforehand in LoopVectorizationLegality. 1466 return Scalars.find(VF) == Scalars.end() || 1467 !isScalarAfterVectorization(I, VF); 1468 }; 1469 1470 /// Returns a range containing only operands needing to be extracted. 1471 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1472 unsigned VF) { 1473 return SmallVector<Value *, 4>(make_filter_range( 1474 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1475 } 1476 1477 public: 1478 /// The loop that we evaluate. 1479 Loop *TheLoop; 1480 1481 /// Predicated scalar evolution analysis. 1482 PredicatedScalarEvolution &PSE; 1483 1484 /// Loop Info analysis. 1485 LoopInfo *LI; 1486 1487 /// Vectorization legality. 1488 LoopVectorizationLegality *Legal; 1489 1490 /// Vector target information. 1491 const TargetTransformInfo &TTI; 1492 1493 /// Target Library Info. 
1494 const TargetLibraryInfo *TLI; 1495 1496 /// Demanded bits analysis. 1497 DemandedBits *DB; 1498 1499 /// Assumption cache. 1500 AssumptionCache *AC; 1501 1502 /// Interface to emit optimization remarks. 1503 OptimizationRemarkEmitter *ORE; 1504 1505 const Function *TheFunction; 1506 1507 /// Loop Vectorize Hint. 1508 const LoopVectorizeHints *Hints; 1509 1510 /// The interleave access information contains groups of interleaved accesses 1511 /// with the same stride and close to each other. 1512 InterleavedAccessInfo &InterleaveInfo; 1513 1514 /// Values to ignore in the cost model. 1515 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1516 1517 /// Values to ignore in the cost model when VF > 1. 1518 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1519 }; 1520 1521 } // end namespace llvm 1522 1523 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1524 // vectorization. The loop needs to be annotated with #pragma omp simd 1525 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1526 // vector length information is not provided, vectorization is not considered 1527 // explicit. Interleave hints are not allowed either. These limitations will be 1528 // relaxed in the future. 1529 // Please, note that we are currently forced to abuse the pragma 'clang 1530 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1531 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1532 // provides *explicit vectorization hints* (LV can bypass legal checks and 1533 // assume that vectorization is legal). However, both hints are implemented 1534 // using the same metadata (llvm.loop.vectorize, processed by 1535 // LoopVectorizeHints). This will be fixed in the future when the native IR 1536 // representation for pragma 'omp simd' is introduced. 1537 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1538 OptimizationRemarkEmitter *ORE) { 1539 assert(!OuterLp->empty() && "This is not an outer loop"); 1540 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1541 1542 // Only outer loops with an explicit vectorization hint are supported. 1543 // Unannotated outer loops are ignored. 1544 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1545 return false; 1546 1547 Function *Fn = OuterLp->getHeader()->getParent(); 1548 if (!Hints.allowVectorization(Fn, OuterLp, 1549 true /*VectorizeOnlyWhenForced*/)) { 1550 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1551 return false; 1552 } 1553 1554 if (Hints.getInterleave() > 1) { 1555 // TODO: Interleave support is future work. 1556 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1557 "outer loops.\n"); 1558 Hints.emitRemarkWithHints(); 1559 return false; 1560 } 1561 1562 return true; 1563 } 1564 1565 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1566 OptimizationRemarkEmitter *ORE, 1567 SmallVectorImpl<Loop *> &V) { 1568 // Collect inner loops and outer loops without irreducible control flow. For 1569 // now, only collect outer loops that have explicit vectorization hints. If we 1570 // are stress testing the VPlan H-CFG construction, we collect the outermost 1571 // loop of every loop nest. 
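// For example (illustrative), given a nest L1 containing L2 containing L3:
// with the VPlan-native path enabled and an explicit vectorization hint on
// L1, only L1 is collected and we return early; otherwise the recursion
// below eventually collects the innermost loop L3, since only loops with no
// sub-loops satisfy L.empty().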
1572 if (L.empty() || VPlanBuildStressTest || 1573 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1574 LoopBlocksRPO RPOT(&L); 1575 RPOT.perform(LI); 1576 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1577 V.push_back(&L); 1578 // TODO: Collect inner loops inside marked outer loops in case 1579 // vectorization fails for the outer loop. Do not invoke 1580 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1581 // already known to be reducible. We can use an inherited attribute for 1582 // that. 1583 return; 1584 } 1585 } 1586 for (Loop *InnerL : L) 1587 collectSupportedLoops(*InnerL, LI, ORE, V); 1588 } 1589 1590 namespace { 1591 1592 /// The LoopVectorize Pass. 1593 struct LoopVectorize : public FunctionPass { 1594 /// Pass identification, replacement for typeid 1595 static char ID; 1596 1597 LoopVectorizePass Impl; 1598 1599 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1600 bool VectorizeOnlyWhenForced = false) 1601 : FunctionPass(ID), 1602 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1603 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1604 } 1605 1606 bool runOnFunction(Function &F) override { 1607 if (skipFunction(F)) 1608 return false; 1609 1610 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1611 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1612 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1613 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1614 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1615 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1616 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1617 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1618 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1619 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1620 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1621 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1622 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1623 1624 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1625 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1626 1627 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1628 GetLAA, *ORE, PSI); 1629 } 1630 1631 void getAnalysisUsage(AnalysisUsage &AU) const override { 1632 AU.addRequired<AssumptionCacheTracker>(); 1633 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1634 AU.addRequired<DominatorTreeWrapperPass>(); 1635 AU.addRequired<LoopInfoWrapperPass>(); 1636 AU.addRequired<ScalarEvolutionWrapperPass>(); 1637 AU.addRequired<TargetTransformInfoWrapperPass>(); 1638 AU.addRequired<AAResultsWrapperPass>(); 1639 AU.addRequired<LoopAccessLegacyAnalysis>(); 1640 AU.addRequired<DemandedBitsWrapperPass>(); 1641 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1642 AU.addRequired<InjectTLIMappingsLegacy>(); 1643 1644 // We currently do not preserve loopinfo/dominator analyses with outer loop 1645 // vectorization. Until this is addressed, mark these analyses as preserved 1646 // only for non-VPlan-native path. 1647 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
1648 if (!EnableVPlanNativePath) { 1649 AU.addPreserved<LoopInfoWrapperPass>(); 1650 AU.addPreserved<DominatorTreeWrapperPass>(); 1651 } 1652 1653 AU.addPreserved<BasicAAWrapperPass>(); 1654 AU.addPreserved<GlobalsAAWrapperPass>(); 1655 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1656 } 1657 }; 1658 1659 } // end anonymous namespace 1660 1661 //===----------------------------------------------------------------------===// 1662 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1663 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1664 //===----------------------------------------------------------------------===// 1665 1666 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1667 // We need to place the broadcast of invariant variables outside the loop, 1668 // but only if it's proven safe to do so. Else, broadcast will be inside 1669 // vector loop body. 1670 Instruction *Instr = dyn_cast<Instruction>(V); 1671 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1672 (!Instr || 1673 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1674 // Place the code for broadcasting invariant variables in the new preheader. 1675 IRBuilder<>::InsertPointGuard Guard(Builder); 1676 if (SafeToHoist) 1677 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1678 1679 // Broadcast the scalar into all locations in the vector. 1680 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1681 1682 return Shuf; 1683 } 1684 1685 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1686 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1687 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1688 "Expected either an induction phi-node or a truncate of it!"); 1689 Value *Start = II.getStartValue(); 1690 1691 // Construct the initial value of the vector IV in the vector loop preheader 1692 auto CurrIP = Builder.saveIP(); 1693 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1694 if (isa<TruncInst>(EntryVal)) { 1695 assert(Start->getType()->isIntegerTy() && 1696 "Truncation requires an integer type"); 1697 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1698 Step = Builder.CreateTrunc(Step, TruncType); 1699 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1700 } 1701 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1702 Value *SteppedStart = 1703 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1704 1705 // We create vector phi nodes for both integer and floating-point induction 1706 // variables. Here, we determine the kind of arithmetic we will perform. 1707 Instruction::BinaryOps AddOp; 1708 Instruction::BinaryOps MulOp; 1709 if (Step->getType()->isIntegerTy()) { 1710 AddOp = Instruction::Add; 1711 MulOp = Instruction::Mul; 1712 } else { 1713 AddOp = II.getInductionOpcode(); 1714 MulOp = Instruction::FMul; 1715 } 1716 1717 // Multiply the vectorization factor by the step using integer or 1718 // floating-point arithmetic as appropriate. 1719 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); 1720 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1721 1722 // Create a vector splat to use in the induction update. 1723 // 1724 // FIXME: If the step is non-constant, we create the vector splat with 1725 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1726 // handle a constant vector splat. 1727 Value *SplatVF = 1728 isa<Constant>(Mul) 1729 ? 
ConstantVector::getSplat({VF, false}, cast<Constant>(Mul)) 1730 : Builder.CreateVectorSplat(VF, Mul); 1731 Builder.restoreIP(CurrIP); 1732 1733 // We may need to add the step a number of times, depending on the unroll 1734 // factor. The last of those goes into the PHI. 1735 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1736 &*LoopVectorBody->getFirstInsertionPt()); 1737 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1738 Instruction *LastInduction = VecInd; 1739 for (unsigned Part = 0; Part < UF; ++Part) { 1740 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1741 1742 if (isa<TruncInst>(EntryVal)) 1743 addMetadata(LastInduction, EntryVal); 1744 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1745 1746 LastInduction = cast<Instruction>(addFastMathFlag( 1747 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1748 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1749 } 1750 1751 // Move the last step to the end of the latch block. This ensures consistent 1752 // placement of all induction updates. 1753 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1754 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1755 auto *ICmp = cast<Instruction>(Br->getCondition()); 1756 LastInduction->moveBefore(ICmp); 1757 LastInduction->setName("vec.ind.next"); 1758 1759 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1760 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1761 } 1762 1763 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1764 return Cost->isScalarAfterVectorization(I, VF) || 1765 Cost->isProfitableToScalarize(I, VF); 1766 } 1767 1768 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1769 if (shouldScalarizeInstruction(IV)) 1770 return true; 1771 auto isScalarInst = [&](User *U) -> bool { 1772 auto *I = cast<Instruction>(U); 1773 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1774 }; 1775 return llvm::any_of(IV->users(), isScalarInst); 1776 } 1777 1778 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1779 const InductionDescriptor &ID, const Instruction *EntryVal, 1780 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1781 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1782 "Expected either an induction phi-node or a truncate of it!"); 1783 1784 // This induction variable is not the phi from the original loop but the 1785 // newly-created IV based on the proof that casted Phi is equal to the 1786 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1787 // re-uses the same InductionDescriptor that original IV uses but we don't 1788 // have to do any recording in this case - that is done when original IV is 1789 // processed. 1790 if (isa<TruncInst>(EntryVal)) 1791 return; 1792 1793 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1794 if (Casts.empty()) 1795 return; 1796 // Only the first Cast instruction in the Casts vector is of interest. 1797 // The rest of the Casts (if exist) have no uses outside the 1798 // induction update chain itself. 
1799 Instruction *CastInst = *Casts.begin(); 1800 if (Lane < UINT_MAX) 1801 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1802 else 1803 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1804 } 1805 1806 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1807 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1808 "Primary induction variable must have an integer type"); 1809 1810 auto II = Legal->getInductionVars().find(IV); 1811 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 1812 1813 auto ID = II->second; 1814 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1815 1816 // The value from the original loop to which we are mapping the new induction 1817 // variable. 1818 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1819 1820 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1821 1822 // Generate code for the induction step. Note that induction steps are 1823 // required to be loop-invariant 1824 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 1825 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 1826 "Induction step should be loop invariant"); 1827 if (PSE.getSE()->isSCEVable(IV->getType())) { 1828 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1829 return Exp.expandCodeFor(Step, Step->getType(), 1830 LoopVectorPreHeader->getTerminator()); 1831 } 1832 return cast<SCEVUnknown>(Step)->getValue(); 1833 }; 1834 1835 // The scalar value to broadcast. This is derived from the canonical 1836 // induction variable. If a truncation type is given, truncate the canonical 1837 // induction variable and step. Otherwise, derive these values from the 1838 // induction descriptor. 1839 auto CreateScalarIV = [&](Value *&Step) -> Value * { 1840 Value *ScalarIV = Induction; 1841 if (IV != OldInduction) { 1842 ScalarIV = IV->getType()->isIntegerTy() 1843 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1844 : Builder.CreateCast(Instruction::SIToFP, Induction, 1845 IV->getType()); 1846 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1847 ScalarIV->setName("offset.idx"); 1848 } 1849 if (Trunc) { 1850 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1851 assert(Step->getType()->isIntegerTy() && 1852 "Truncation requires an integer step"); 1853 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1854 Step = Builder.CreateTrunc(Step, TruncType); 1855 } 1856 return ScalarIV; 1857 }; 1858 1859 // Create the vector values from the scalar IV, in the absence of creating a 1860 // vector IV. 1861 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 1862 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1863 for (unsigned Part = 0; Part < UF; ++Part) { 1864 Value *EntryPart = 1865 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); 1866 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1867 if (Trunc) 1868 addMetadata(EntryPart, Trunc); 1869 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1870 } 1871 }; 1872 1873 // Now do the actual transformations, and start with creating the step value. 1874 Value *Step = CreateStepValue(ID.getStep()); 1875 if (VF <= 1) { 1876 Value *ScalarIV = CreateScalarIV(Step); 1877 CreateSplatIV(ScalarIV, Step); 1878 return; 1879 } 1880 1881 // Determine if we want a scalar version of the induction variable. 
This is 1882 // true if the induction variable itself is not widened, or if it has at 1883 // least one user in the loop that is not widened. 1884 auto NeedsScalarIV = needsScalarInduction(EntryVal); 1885 if (!NeedsScalarIV) { 1886 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1887 return; 1888 } 1889 1890 // Try to create a new independent vector induction variable. If we can't 1891 // create the phi node, we will splat the scalar induction variable in each 1892 // loop iteration. 1893 if (!shouldScalarizeInstruction(EntryVal)) { 1894 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1895 Value *ScalarIV = CreateScalarIV(Step); 1896 // Create scalar steps that can be used by instructions we will later 1897 // scalarize. Note that the addition of the scalar steps will not increase 1898 // the number of instructions in the loop in the common case prior to 1899 // InstCombine. We will be trading one vector extract for each scalar step. 1900 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1901 return; 1902 } 1903 1904 // If we haven't yet vectorized the induction variable, splat the scalar 1905 // induction variable, and build the necessary step vectors. 1906 // TODO: Don't do it unless the vectorized IV is really required. 1907 Value *ScalarIV = CreateScalarIV(Step); 1908 CreateSplatIV(ScalarIV, Step); 1909 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1910 } 1911 1912 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 1913 Instruction::BinaryOps BinOp) { 1914 // Create and check the types. 1915 auto *ValVTy = cast<VectorType>(Val->getType()); 1916 int VLen = ValVTy->getNumElements(); 1917 1918 Type *STy = Val->getType()->getScalarType(); 1919 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1920 "Induction Step must be an integer or FP"); 1921 assert(Step->getType() == STy && "Step has wrong type"); 1922 1923 SmallVector<Constant *, 8> Indices; 1924 1925 if (STy->isIntegerTy()) { 1926 // Create a vector of consecutive numbers from zero to VF. 1927 for (int i = 0; i < VLen; ++i) 1928 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 1929 1930 // Add the consecutive indices to the vector value. 1931 Constant *Cv = ConstantVector::get(Indices); 1932 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 1933 Step = Builder.CreateVectorSplat(VLen, Step); 1934 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1935 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 1936 // which can be found from the original scalar operations. 1937 Step = Builder.CreateMul(Cv, Step); 1938 return Builder.CreateAdd(Val, Step, "induction"); 1939 } 1940 1941 // Floating point induction. 1942 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1943 "Binary Opcode should be specified for FP induction"); 1944 // Create a vector of consecutive numbers from zero to VF. 1945 for (int i = 0; i < VLen; ++i) 1946 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 1947 1948 // Add the consecutive indices to the vector value. 1949 Constant *Cv = ConstantVector::get(Indices); 1950 1951 Step = Builder.CreateVectorSplat(VLen, Step); 1952 1953 // Floating point operations had to be 'fast' to enable the induction. 
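// For example (illustrative values), with StartIdx = 0, VLen = 4 and step
// %s, Cv is <0.0, 1.0, 2.0, 3.0>; the fmul and fadd/fsub below then produce
// Val + <0.0, 1.0, 2.0, 3.0> * %s.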
1954 FastMathFlags Flags; 1955 Flags.setFast(); 1956 1957 Value *MulOp = Builder.CreateFMul(Cv, Step); 1958 if (isa<Instruction>(MulOp)) 1959 // Have to check, MulOp may be a constant 1960 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 1961 1962 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1963 if (isa<Instruction>(BOp)) 1964 cast<Instruction>(BOp)->setFastMathFlags(Flags); 1965 return BOp; 1966 } 1967 1968 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 1969 Instruction *EntryVal, 1970 const InductionDescriptor &ID) { 1971 // We shouldn't have to build scalar steps if we aren't vectorizing. 1972 assert(VF > 1 && "VF should be greater than one"); 1973 1974 // Get the value type and ensure it and the step have the same integer type. 1975 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 1976 assert(ScalarIVTy == Step->getType() && 1977 "Val and Step should have the same type"); 1978 1979 // We build scalar steps for both integer and floating-point induction 1980 // variables. Here, we determine the kind of arithmetic we will perform. 1981 Instruction::BinaryOps AddOp; 1982 Instruction::BinaryOps MulOp; 1983 if (ScalarIVTy->isIntegerTy()) { 1984 AddOp = Instruction::Add; 1985 MulOp = Instruction::Mul; 1986 } else { 1987 AddOp = ID.getInductionOpcode(); 1988 MulOp = Instruction::FMul; 1989 } 1990 1991 // Determine the number of scalars we need to generate for each unroll 1992 // iteration. If EntryVal is uniform, we only need to generate the first 1993 // lane. Otherwise, we generate all VF values. 1994 unsigned Lanes = 1995 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 1996 : VF; 1997 // Compute the scalar steps and save the results in VectorLoopValueMap. 1998 for (unsigned Part = 0; Part < UF; ++Part) { 1999 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2000 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); 2001 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2002 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2003 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2004 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2005 } 2006 } 2007 } 2008 2009 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2010 assert(V != Induction && "The new induction variable should not be used."); 2011 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2012 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2013 2014 // If we have a stride that is replaced by one, do it here. Defer this for 2015 // the VPlan-native path until we start running Legal checks in that path. 2016 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2017 V = ConstantInt::get(V->getType(), 1); 2018 2019 // If we have a vector mapped to this value, return it. 2020 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2021 return VectorLoopValueMap.getVectorValue(V, Part); 2022 2023 // If the value has not been vectorized, check if it has been scalarized 2024 // instead. If it has been scalarized, and we actually need the value in 2025 // vector form, we will construct the vector values on demand. 2026 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2027 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2028 2029 // If we've scalarized a value, that value should be an instruction. 
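// (Illustrative: for VF = 4, a non-uniform scalarized value is re-packed
// further down with four insertelement instructions starting from undef,
// while a uniform value is simply broadcast from its lane-zero scalar.)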
2030 auto *I = cast<Instruction>(V); 2031 2032 // If we aren't vectorizing, we can just copy the scalar map values over to 2033 // the vector map. 2034 if (VF == 1) { 2035 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2036 return ScalarValue; 2037 } 2038 2039 // Get the last scalar instruction we generated for V and Part. If the value 2040 // is known to be uniform after vectorization, this corresponds to lane zero 2041 // of the Part unroll iteration. Otherwise, the last instruction is the one 2042 // we created for the last vector lane of the Part unroll iteration. 2043 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; 2044 auto *LastInst = cast<Instruction>( 2045 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2046 2047 // Set the insert point after the last scalarized instruction. This ensures 2048 // the insertelement sequence will directly follow the scalar definitions. 2049 auto OldIP = Builder.saveIP(); 2050 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2051 Builder.SetInsertPoint(&*NewIP); 2052 2053 // However, if we are vectorizing, we need to construct the vector values. 2054 // If the value is known to be uniform after vectorization, we can just 2055 // broadcast the scalar value corresponding to lane zero for each unroll 2056 // iteration. Otherwise, we construct the vector values using insertelement 2057 // instructions. Since the resulting vectors are stored in 2058 // VectorLoopValueMap, we will only generate the insertelements once. 2059 Value *VectorValue = nullptr; 2060 if (Cost->isUniformAfterVectorization(I, VF)) { 2061 VectorValue = getBroadcastInstrs(ScalarValue); 2062 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2063 } else { 2064 // Initialize packing with insertelements to start from undef. 2065 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2066 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2067 for (unsigned Lane = 0; Lane < VF; ++Lane) 2068 packScalarIntoVectorValue(V, {Part, Lane}); 2069 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2070 } 2071 Builder.restoreIP(OldIP); 2072 return VectorValue; 2073 } 2074 2075 // If this scalar is unknown, assume that it is a constant or that it is 2076 // loop invariant. Broadcast V and save the value for future uses. 2077 Value *B = getBroadcastInstrs(V); 2078 VectorLoopValueMap.setVectorValue(V, Part, B); 2079 return B; 2080 } 2081 2082 Value * 2083 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2084 const VPIteration &Instance) { 2085 // If the value is not an instruction contained in the loop, it should 2086 // already be scalar. 2087 if (OrigLoop->isLoopInvariant(V)) 2088 return V; 2089 2090 assert(Instance.Lane > 0 2091 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2092 : true && "Uniform values only have lane zero"); 2093 2094 // If the value from the original loop has not been vectorized, it is 2095 // represented by UF x VF scalar values in the new loop. Return the requested 2096 // scalar value. 2097 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2098 return VectorLoopValueMap.getScalarValue(V, Instance); 2099 2100 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2101 // for the given unroll part. If this entry is not a vector type (i.e., the 2102 // vectorization factor is one), there is no need to generate an 2103 // extractelement instruction. 
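// For example (illustrative), with VF = 4 and Instance = {Part 1, Lane 2},
// the Part-1 vector value is obtained (or created) below and its element 2
// is returned via an extractelement.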
2104 auto *U = getOrCreateVectorValue(V, Instance.Part); 2105 if (!U->getType()->isVectorTy()) { 2106 assert(VF == 1 && "Value not scalarized has non-vector type"); 2107 return U; 2108 } 2109 2110 // Otherwise, the value from the original loop has been vectorized and is 2111 // represented by UF vector values. Extract and return the requested scalar 2112 // value from the appropriate vector lane. 2113 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2114 } 2115 2116 void InnerLoopVectorizer::packScalarIntoVectorValue( 2117 Value *V, const VPIteration &Instance) { 2118 assert(V != Induction && "The new induction variable should not be used."); 2119 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2120 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2121 2122 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2123 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2124 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2125 Builder.getInt32(Instance.Lane)); 2126 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2127 } 2128 2129 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2130 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2131 SmallVector<int, 8> ShuffleMask; 2132 for (unsigned i = 0; i < VF; ++i) 2133 ShuffleMask.push_back(VF - i - 1); 2134 2135 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2136 ShuffleMask, "reverse"); 2137 } 2138 2139 // Return whether we allow using masked interleave-groups (for dealing with 2140 // strided loads/stores that reside in predicated blocks, or for dealing 2141 // with gaps). 2142 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2143 // If an override option has been passed in for interleaved accesses, use it. 2144 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2145 return EnableMaskedInterleavedMemAccesses; 2146 2147 return TTI.enableMaskedInterleavedAccessVectorization(); 2148 } 2149 2150 // Try to vectorize the interleave group that \p Instr belongs to. 2151 // 2152 // E.g. Translate following interleaved load group (factor = 3): 2153 // for (i = 0; i < N; i+=3) { 2154 // R = Pic[i]; // Member of index 0 2155 // G = Pic[i+1]; // Member of index 1 2156 // B = Pic[i+2]; // Member of index 2 2157 // ... // do something to R, G, B 2158 // } 2159 // To: 2160 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2161 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2162 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2163 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2164 // 2165 // Or translate following interleaved store group (factor = 3): 2166 // for (i = 0; i < N; i+=3) { 2167 // ... 
do something to R, G, B 2168 // Pic[i] = R; // Member of index 0 2169 // Pic[i+1] = G; // Member of index 1 2170 // Pic[i+2] = B; // Member of index 2 2171 // } 2172 // To: 2173 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2174 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2175 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2176 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2177 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2178 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2179 const InterleaveGroup<Instruction> *Group, VPTransformState &State, 2180 VPValue *Addr, VPValue *BlockInMask) { 2181 Instruction *Instr = Group->getInsertPos(); 2182 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2183 2184 // Prepare for the vector type of the interleaved load/store. 2185 Type *ScalarTy = getMemInstValueType(Instr); 2186 unsigned InterleaveFactor = Group->getFactor(); 2187 Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); 2188 2189 // Prepare for the new pointers. 2190 SmallVector<Value *, 2> AddrParts; 2191 unsigned Index = Group->getIndex(Instr); 2192 2193 // TODO: extend the masked interleaved-group support to reversed access. 2194 assert((!BlockInMask || !Group->isReverse()) && 2195 "Reversed masked interleave-group not supported."); 2196 2197 // If the group is reverse, adjust the index to refer to the last vector lane 2198 // instead of the first. We adjust the index from the first vector lane, 2199 // rather than directly getting the pointer for lane VF - 1, because the 2200 // pointer operand of the interleaved access is supposed to be uniform. For 2201 // uniform instructions, we're only required to generate a value for the 2202 // first vector lane in each unroll iteration. 2203 if (Group->isReverse()) 2204 Index += (VF - 1) * Group->getFactor(); 2205 2206 for (unsigned Part = 0; Part < UF; Part++) { 2207 Value *AddrPart = State.get(Addr, {Part, 0}); 2208 setDebugLocFromInst(Builder, AddrPart); 2209 2210 // Notice current instruction could be any index. Need to adjust the address 2211 // to the member of index 0. 2212 // 2213 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2214 // b = A[i]; // Member of index 0 2215 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2216 // 2217 // E.g. A[i+1] = a; // Member of index 1 2218 // A[i] = b; // Member of index 0 2219 // A[i+2] = c; // Member of index 2 (Current instruction) 2220 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2221 2222 bool InBounds = false; 2223 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2224 InBounds = gep->isInBounds(); 2225 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2226 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2227 2228 // Cast to the vector pointer type. 2229 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2230 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2231 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2232 } 2233 2234 setDebugLocFromInst(Builder, Instr); 2235 Value *UndefVec = UndefValue::get(VecTy); 2236 2237 Value *MaskForGaps = nullptr; 2238 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2239 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); 2240 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2241 } 2242 2243 // Vectorize the interleaved load group. 
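// For a masked group (illustrative, VF = 4, factor = 3): the block mask
// <m0, m1, m2, m3> is replicated to
// <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3> and, when a gap mask is
// also needed, the two are combined with an 'and' before the masked load.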
2244 if (isa<LoadInst>(Instr)) { 2245 // For each unroll part, create a wide load for the group. 2246 SmallVector<Value *, 2> NewLoads; 2247 for (unsigned Part = 0; Part < UF; Part++) { 2248 Instruction *NewLoad; 2249 if (BlockInMask || MaskForGaps) { 2250 assert(useMaskedInterleavedAccesses(*TTI) && 2251 "masked interleaved groups are not allowed."); 2252 Value *GroupMask = MaskForGaps; 2253 if (BlockInMask) { 2254 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2255 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2256 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2257 Value *ShuffledMask = Builder.CreateShuffleVector( 2258 BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); 2259 GroupMask = MaskForGaps 2260 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2261 MaskForGaps) 2262 : ShuffledMask; 2263 } 2264 NewLoad = 2265 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2266 GroupMask, UndefVec, "wide.masked.vec"); 2267 } 2268 else 2269 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2270 Group->getAlign(), "wide.vec"); 2271 Group->addMetadata(NewLoad); 2272 NewLoads.push_back(NewLoad); 2273 } 2274 2275 // For each member in the group, shuffle out the appropriate data from the 2276 // wide loads. 2277 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2278 Instruction *Member = Group->getMember(I); 2279 2280 // Skip the gaps in the group. 2281 if (!Member) 2282 continue; 2283 2284 Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF); 2285 for (unsigned Part = 0; Part < UF; Part++) { 2286 Value *StridedVec = Builder.CreateShuffleVector( 2287 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2288 2289 // If this member has different type, cast the result type. 2290 if (Member->getType() != ScalarTy) { 2291 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2292 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2293 } 2294 2295 if (Group->isReverse()) 2296 StridedVec = reverseVector(StridedVec); 2297 2298 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2299 } 2300 } 2301 return; 2302 } 2303 2304 // The sub vector type for current instruction. 2305 VectorType *SubVT = VectorType::get(ScalarTy, VF); 2306 2307 // Vectorize the interleaved store group. 2308 for (unsigned Part = 0; Part < UF; Part++) { 2309 // Collect the stored vector from each member. 2310 SmallVector<Value *, 4> StoredVecs; 2311 for (unsigned i = 0; i < InterleaveFactor; i++) { 2312 // Interleaved store group doesn't allow a gap, so each index has a member 2313 Instruction *Member = Group->getMember(i); 2314 assert(Member && "Fail to get a member from an interleaved store group"); 2315 2316 Value *StoredVec = getOrCreateVectorValue( 2317 cast<StoreInst>(Member)->getValueOperand(), Part); 2318 if (Group->isReverse()) 2319 StoredVec = reverseVector(StoredVec); 2320 2321 // If this member has different type, cast it to a unified type. 2322 2323 if (StoredVec->getType() != SubVT) 2324 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2325 2326 StoredVecs.push_back(StoredVec); 2327 } 2328 2329 // Concatenate all vectors into a wide vector. 2330 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2331 2332 // Interleave the elements in the wide vector. 
2333 Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor); 2334 Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, 2335 "interleaved.vec"); 2336 2337 Instruction *NewStoreInstr; 2338 if (BlockInMask) { 2339 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2340 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2341 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2342 Value *ShuffledMask = Builder.CreateShuffleVector( 2343 BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); 2344 NewStoreInstr = Builder.CreateMaskedStore( 2345 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2346 } 2347 else 2348 NewStoreInstr = 2349 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2350 2351 Group->addMetadata(NewStoreInstr); 2352 } 2353 } 2354 2355 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2356 VPTransformState &State, 2357 VPValue *Addr, 2358 VPValue *StoredValue, 2359 VPValue *BlockInMask) { 2360 // Attempt to issue a wide load. 2361 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2362 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2363 2364 assert((LI || SI) && "Invalid Load/Store instruction"); 2365 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2366 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2367 2368 LoopVectorizationCostModel::InstWidening Decision = 2369 Cost->getWideningDecision(Instr, VF); 2370 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2371 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2372 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2373 "CM decision is not to widen the memory instruction"); 2374 2375 Type *ScalarDataTy = getMemInstValueType(Instr); 2376 Type *DataTy = VectorType::get(ScalarDataTy, VF); 2377 // An alignment of 0 means target abi alignment. We need to use the scalar's 2378 // target abi alignment in such a case. 2379 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2380 const Align Alignment = 2381 DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy); 2382 2383 // Determine if the pointer operand of the access is either consecutive or 2384 // reverse consecutive. 2385 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2386 bool ConsecutiveStride = 2387 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2388 bool CreateGatherScatter = 2389 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2390 2391 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2392 // gather/scatter. Otherwise Decision should have been to Scalarize. 2393 assert((ConsecutiveStride || CreateGatherScatter) && 2394 "The instruction should be scalarized"); 2395 (void)ConsecutiveStride; 2396 2397 VectorParts BlockInMaskParts(UF); 2398 bool isMaskRequired = BlockInMask; 2399 if (isMaskRequired) 2400 for (unsigned Part = 0; Part < UF; ++Part) 2401 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2402 2403 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2404 // Calculate the pointer for the specific unroll-part. 2405 GetElementPtrInst *PartPtr = nullptr; 2406 2407 bool InBounds = false; 2408 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2409 InBounds = gep->isInBounds(); 2410 2411 if (Reverse) { 2412 // If the address is consecutive but reversed, then the 2413 // wide store needs to start at the last vector element. 
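// For example (illustrative), with VF = 4: Part 0 is addressed at
// Ptr + (1 - 4), i.e. Ptr - 3, covering elements [-3, 0]; Part 1 at
// Ptr - 4 + (1 - 4) = Ptr - 7, covering [-7, -4]. The loaded or stored
// vector (and any mask) is reversed separately.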
2414 PartPtr = cast<GetElementPtrInst>( 2415 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); 2416 PartPtr->setIsInBounds(InBounds); 2417 PartPtr = cast<GetElementPtrInst>( 2418 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); 2419 PartPtr->setIsInBounds(InBounds); 2420 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2421 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2422 } else { 2423 PartPtr = cast<GetElementPtrInst>( 2424 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); 2425 PartPtr->setIsInBounds(InBounds); 2426 } 2427 2428 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2429 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2430 }; 2431 2432 // Handle Stores: 2433 if (SI) { 2434 setDebugLocFromInst(Builder, SI); 2435 2436 for (unsigned Part = 0; Part < UF; ++Part) { 2437 Instruction *NewSI = nullptr; 2438 Value *StoredVal = State.get(StoredValue, Part); 2439 if (CreateGatherScatter) { 2440 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2441 Value *VectorGep = State.get(Addr, Part); 2442 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2443 MaskPart); 2444 } else { 2445 if (Reverse) { 2446 // If we store to reverse consecutive memory locations, then we need 2447 // to reverse the order of elements in the stored value. 2448 StoredVal = reverseVector(StoredVal); 2449 // We don't want to update the value in the map as it might be used in 2450 // another expression. So don't call resetVectorValue(StoredVal). 2451 } 2452 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2453 if (isMaskRequired) 2454 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2455 BlockInMaskParts[Part]); 2456 else 2457 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2458 } 2459 addMetadata(NewSI, SI); 2460 } 2461 return; 2462 } 2463 2464 // Handle loads. 2465 assert(LI && "Must have a load instruction"); 2466 setDebugLocFromInst(Builder, LI); 2467 for (unsigned Part = 0; Part < UF; ++Part) { 2468 Value *NewLI; 2469 if (CreateGatherScatter) { 2470 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2471 Value *VectorGep = State.get(Addr, Part); 2472 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2473 nullptr, "wide.masked.gather"); 2474 addMetadata(NewLI, LI); 2475 } else { 2476 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2477 if (isMaskRequired) 2478 NewLI = Builder.CreateMaskedLoad( 2479 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2480 "wide.masked.load"); 2481 else 2482 NewLI = 2483 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2484 2485 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2486 addMetadata(NewLI, LI); 2487 if (Reverse) 2488 NewLI = reverseVector(NewLI); 2489 } 2490 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2491 } 2492 } 2493 2494 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2495 const VPIteration &Instance, 2496 bool IfPredicateInstr) { 2497 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2498 2499 setDebugLocFromInst(Builder, Instr); 2500 2501 // Does this instruction return a value ? 
2502 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2503 2504 Instruction *Cloned = Instr->clone(); 2505 if (!IsVoidRetTy) 2506 Cloned->setName(Instr->getName() + ".cloned"); 2507 2508 // Replace the operands of the cloned instructions with their scalar 2509 // equivalents in the new loop. 2510 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 2511 auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance); 2512 Cloned->setOperand(op, NewOp); 2513 } 2514 addNewMetadata(Cloned, Instr); 2515 2516 // Place the cloned scalar in the new loop. 2517 Builder.Insert(Cloned); 2518 2519 // Add the cloned scalar to the scalar map entry. 2520 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2521 2522 // If we just cloned a new assumption, add it the assumption cache. 2523 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2524 if (II->getIntrinsicID() == Intrinsic::assume) 2525 AC->registerAssumption(II); 2526 2527 // End if-block. 2528 if (IfPredicateInstr) 2529 PredicatedInstructions.push_back(Cloned); 2530 } 2531 2532 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2533 Value *End, Value *Step, 2534 Instruction *DL) { 2535 BasicBlock *Header = L->getHeader(); 2536 BasicBlock *Latch = L->getLoopLatch(); 2537 // As we're just creating this loop, it's possible no latch exists 2538 // yet. If so, use the header as this will be a single block loop. 2539 if (!Latch) 2540 Latch = Header; 2541 2542 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2543 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2544 setDebugLocFromInst(Builder, OldInst); 2545 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2546 2547 Builder.SetInsertPoint(Latch->getTerminator()); 2548 setDebugLocFromInst(Builder, OldInst); 2549 2550 // Create i+1 and fill the PHINode. 2551 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2552 Induction->addIncoming(Start, L->getLoopPreheader()); 2553 Induction->addIncoming(Next, Latch); 2554 // Create the compare. 2555 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2556 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2557 2558 // Now we have two terminators. Remove the old one from the block. 2559 Latch->getTerminator()->eraseFromParent(); 2560 2561 return Induction; 2562 } 2563 2564 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2565 if (TripCount) 2566 return TripCount; 2567 2568 assert(L && "Create Trip Count for null loop."); 2569 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2570 // Find the loop boundaries. 2571 ScalarEvolution *SE = PSE.getSE(); 2572 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2573 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2574 "Invalid loop count"); 2575 2576 Type *IdxTy = Legal->getWidestInductionType(); 2577 assert(IdxTy && "No type for induction"); 2578 2579 // The exit count might have the type of i64 while the phi is i32. This can 2580 // happen if we have an induction variable that is sign extended before the 2581 // compare. The only way that we get a backedge taken count is that the 2582 // induction variable was signed and as such will not overflow. In such a case 2583 // truncation is legal. 
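// For example (illustrative), an i32 induction variable that is sign
// extended to i64 for the exit compare produces an i64 backedge-taken count
// while the widest induction type is i32; the count is truncated and/or
// zero extended to that type below.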
2584 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2585 IdxTy->getPrimitiveSizeInBits()) 2586 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2587 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2588 2589 // Get the total trip count from the count by adding 1. 2590 const SCEV *ExitCount = SE->getAddExpr( 2591 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2592 2593 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2594 2595 // Expand the trip count and place the new instructions in the preheader. 2596 // Notice that the pre-header does not change, only the loop body. 2597 SCEVExpander Exp(*SE, DL, "induction"); 2598 2599 // Count holds the overall loop count (N). 2600 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2601 L->getLoopPreheader()->getTerminator()); 2602 2603 if (TripCount->getType()->isPointerTy()) 2604 TripCount = 2605 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2606 L->getLoopPreheader()->getTerminator()); 2607 2608 return TripCount; 2609 } 2610 2611 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2612 if (VectorTripCount) 2613 return VectorTripCount; 2614 2615 Value *TC = getOrCreateTripCount(L); 2616 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2617 2618 Type *Ty = TC->getType(); 2619 Constant *Step = ConstantInt::get(Ty, VF * UF); 2620 2621 // If the tail is to be folded by masking, round the number of iterations N 2622 // up to a multiple of Step instead of rounding down. This is done by first 2623 // adding Step-1 and then rounding down. Note that it's ok if this addition 2624 // overflows: the vector induction variable will eventually wrap to zero given 2625 // that it starts at zero and its Step is a power of two; the loop will then 2626 // exit, with the last early-exit vector comparison also producing all-true. 2627 if (Cost->foldTailByMasking()) { 2628 assert(isPowerOf2_32(VF * UF) && 2629 "VF*UF must be a power of 2 when folding tail by masking"); 2630 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); 2631 } 2632 2633 // Now we need to generate the expression for the part of the loop that the 2634 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2635 // iterations are not required for correctness, or N - Step, otherwise. Step 2636 // is equal to the vectorization factor (number of SIMD elements) times the 2637 // unroll factor (number of SIMD instructions). 2638 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2639 2640 // If there is a non-reversed interleaved group that may speculatively access 2641 // memory out-of-bounds, we need to ensure that there will be at least one 2642 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2643 // the trip count, we set the remainder to be equal to the step. If the step 2644 // does not evenly divide the trip count, no adjustment is necessary since 2645 // there will already be scalar iterations. Note that the minimum iterations 2646 // check ensures that N >= Step. 
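// For example (illustrative), with VF * UF = 8 and a trip count of 20:
// R = 20 % 8 = 4 and the vector trip count is 16. When folding the tail by
// masking, the count is first rounded up to 27, so R = 3 and the vector trip
// count is 24. If R were 0 while a scalar epilogue is required, R is set to
// 8 below so the scalar epilogue loop still executes.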
2647 if (VF > 1 && Cost->requiresScalarEpilogue()) { 2648 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2649 R = Builder.CreateSelect(IsZero, Step, R); 2650 } 2651 2652 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2653 2654 return VectorTripCount; 2655 } 2656 2657 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2658 const DataLayout &DL) { 2659 // Verify that V is a vector type with same number of elements as DstVTy. 2660 unsigned VF = DstVTy->getNumElements(); 2661 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2662 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2663 Type *SrcElemTy = SrcVecTy->getElementType(); 2664 Type *DstElemTy = DstVTy->getElementType(); 2665 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2666 "Vector elements must have same size"); 2667 2668 // Do a direct cast if element types are castable. 2669 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2670 return Builder.CreateBitOrPointerCast(V, DstVTy); 2671 } 2672 // V cannot be directly casted to desired vector type. 2673 // May happen when V is a floating point vector but DstVTy is a vector of 2674 // pointers or vice-versa. Handle this using a two-step bitcast using an 2675 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2676 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2677 "Only one type should be a pointer type"); 2678 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2679 "Only one type should be a floating point type"); 2680 Type *IntTy = 2681 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2682 VectorType *VecIntTy = VectorType::get(IntTy, VF); 2683 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2684 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2685 } 2686 2687 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2688 BasicBlock *Bypass) { 2689 Value *Count = getOrCreateTripCount(L); 2690 // Reuse existing vector loop preheader for TC checks. 2691 // Note that new preheader block is generated for vector loop. 2692 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2693 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2694 2695 // Generate code to check if the loop's trip count is less than VF * UF, or 2696 // equal to it in case a scalar epilogue is required; this implies that the 2697 // vector trip count is zero. This check also covers the case where adding one 2698 // to the backedge-taken count overflowed leading to an incorrect trip count 2699 // of zero. In this case we will also jump to the scalar loop. 2700 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 2701 : ICmpInst::ICMP_ULT; 2702 2703 // If tail is to be folded, vector loop takes care of all iterations. 2704 Value *CheckMinIters = Builder.getFalse(); 2705 if (!Cost->foldTailByMasking()) 2706 CheckMinIters = Builder.CreateICmp( 2707 P, Count, ConstantInt::get(Count->getType(), VF * UF), 2708 "min.iters.check"); 2709 2710 // Create new preheader for vector loop. 2711 LoopVectorPreHeader = 2712 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2713 "vector.ph"); 2714 2715 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2716 DT->getNode(Bypass)->getIDom()) && 2717 "TC check is expected to dominate Bypass"); 2718 2719 // Update dominator for Bypass & LoopExit. 
2720 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2721 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2722 2723 ReplaceInstWithInst( 2724 TCCheckBlock->getTerminator(), 2725 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2726 LoopBypassBlocks.push_back(TCCheckBlock); 2727 } 2728 2729 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2730 // Reuse existing vector loop preheader for SCEV checks. 2731 // Note that new preheader block is generated for vector loop. 2732 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 2733 2734 // Generate the code to check that the SCEV assumptions that we made. 2735 // We want the new basic block to start at the first instruction in a 2736 // sequence of instructions that form a check. 2737 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2738 "scev.check"); 2739 Value *SCEVCheck = Exp.expandCodeForPredicate( 2740 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 2741 2742 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2743 if (C->isZero()) 2744 return; 2745 2746 assert(!SCEVCheckBlock->getParent()->hasOptSize() && 2747 "Cannot SCEV check stride or overflow when optimizing for size"); 2748 2749 SCEVCheckBlock->setName("vector.scevcheck"); 2750 // Create new preheader for vector loop. 2751 LoopVectorPreHeader = 2752 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 2753 nullptr, "vector.ph"); 2754 2755 // Update dominator only if this is first RT check. 2756 if (LoopBypassBlocks.empty()) { 2757 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2758 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2759 } 2760 2761 ReplaceInstWithInst( 2762 SCEVCheckBlock->getTerminator(), 2763 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 2764 LoopBypassBlocks.push_back(SCEVCheckBlock); 2765 AddedSafetyChecks = true; 2766 } 2767 2768 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2769 // VPlan-native path does not do any analysis for runtime checks currently. 2770 if (EnableVPlanNativePath) 2771 return; 2772 2773 // Reuse existing vector loop preheader for runtime memory checks. 2774 // Note that new preheader block is generated for vector loop. 2775 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 2776 2777 // Generate the code that checks in runtime if arrays overlap. We put the 2778 // checks into a separate block to make the more common case of few elements 2779 // faster. 2780 Instruction *FirstCheckInst; 2781 Instruction *MemRuntimeCheck; 2782 std::tie(FirstCheckInst, MemRuntimeCheck) = 2783 Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator()); 2784 if (!MemRuntimeCheck) 2785 return; 2786 2787 if (MemCheckBlock->getParent()->hasOptSize()) { 2788 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2789 "Cannot emit memory checks when optimizing for size, unless forced " 2790 "to vectorize."); 2791 ORE->emit([&]() { 2792 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2793 L->getStartLoc(), L->getHeader()) 2794 << "Code-size may be reduced by not forcing " 2795 "vectorization, or by source-code modifications " 2796 "eliminating the need for runtime checks " 2797 "(e.g., adding 'restrict')."; 2798 }); 2799 } 2800 2801 MemCheckBlock->setName("vector.memcheck"); 2802 // Create new preheader for vector loop. 
2803 LoopVectorPreHeader = 2804 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2805 "vector.ph"); 2806 2807 // Update dominator only if this is first RT check. 2808 if (LoopBypassBlocks.empty()) { 2809 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2810 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2811 } 2812 2813 ReplaceInstWithInst( 2814 MemCheckBlock->getTerminator(), 2815 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); 2816 LoopBypassBlocks.push_back(MemCheckBlock); 2817 AddedSafetyChecks = true; 2818 2819 // We currently don't use LoopVersioning for the actual loop cloning but we 2820 // still use it to add the noalias metadata. 2821 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2822 PSE.getSE()); 2823 LVer->prepareNoAliasMetadata(); 2824 } 2825 2826 Value *InnerLoopVectorizer::emitTransformedIndex( 2827 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2828 const InductionDescriptor &ID) const { 2829 2830 SCEVExpander Exp(*SE, DL, "induction"); 2831 auto Step = ID.getStep(); 2832 auto StartValue = ID.getStartValue(); 2833 assert(Index->getType() == Step->getType() && 2834 "Index type does not match StepValue type"); 2835 2836 // Note: the IR at this point is broken. We cannot use SE to create any new 2837 // SCEV and then expand it, hoping that SCEV's simplification will give us 2838 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2839 // lead to various SCEV crashes. So all we can do is to use builder and rely 2840 // on InstCombine for future simplifications. Here we handle some trivial 2841 // cases only. 2842 auto CreateAdd = [&B](Value *X, Value *Y) { 2843 assert(X->getType() == Y->getType() && "Types don't match!"); 2844 if (auto *CX = dyn_cast<ConstantInt>(X)) 2845 if (CX->isZero()) 2846 return Y; 2847 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2848 if (CY->isZero()) 2849 return X; 2850 return B.CreateAdd(X, Y); 2851 }; 2852 2853 auto CreateMul = [&B](Value *X, Value *Y) { 2854 assert(X->getType() == Y->getType() && "Types don't match!"); 2855 if (auto *CX = dyn_cast<ConstantInt>(X)) 2856 if (CX->isOne()) 2857 return Y; 2858 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2859 if (CY->isOne()) 2860 return X; 2861 return B.CreateMul(X, Y); 2862 }; 2863 2864 switch (ID.getKind()) { 2865 case InductionDescriptor::IK_IntInduction: { 2866 assert(Index->getType() == StartValue->getType() && 2867 "Index type does not match StartValue type"); 2868 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 2869 return B.CreateSub(StartValue, Index); 2870 auto *Offset = CreateMul( 2871 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); 2872 return CreateAdd(StartValue, Offset); 2873 } 2874 case InductionDescriptor::IK_PtrInduction: { 2875 assert(isa<SCEVConstant>(Step) && 2876 "Expected constant step for pointer induction"); 2877 return B.CreateGEP( 2878 StartValue->getType()->getPointerElementType(), StartValue, 2879 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), 2880 &*B.GetInsertPoint()))); 2881 } 2882 case InductionDescriptor::IK_FpInduction: { 2883 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2884 auto InductionBinOp = ID.getInductionBinOp(); 2885 assert(InductionBinOp && 2886 (InductionBinOp->getOpcode() == Instruction::FAdd || 2887 InductionBinOp->getOpcode() == Instruction::FSub) && 2888 "Original bin op should be defined for FP induction"); 2889 2890 Value 
*StepValue = cast<SCEVUnknown>(Step)->getValue(); 2891 2892 // Floating point operations had to be 'fast' to enable the induction. 2893 FastMathFlags Flags; 2894 Flags.setFast(); 2895 2896 Value *MulExp = B.CreateFMul(StepValue, Index); 2897 if (isa<Instruction>(MulExp)) 2898 // We have to check, the MulExp may be a constant. 2899 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 2900 2901 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2902 "induction"); 2903 if (isa<Instruction>(BOp)) 2904 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2905 2906 return BOp; 2907 } 2908 case InductionDescriptor::IK_NoInduction: 2909 return nullptr; 2910 } 2911 llvm_unreachable("invalid enum"); 2912 } 2913 2914 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 2915 /* 2916 In this function we generate a new loop. The new loop will contain 2917 the vectorized instructions while the old loop will continue to run the 2918 scalar remainder. 2919 2920 [ ] <-- loop iteration number check. 2921 / | 2922 / v 2923 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2924 | / | 2925 | / v 2926 || [ ] <-- vector pre header. 2927 |/ | 2928 | v 2929 | [ ] \ 2930 | [ ]_| <-- vector loop. 2931 | | 2932 | v 2933 | -[ ] <--- middle-block. 2934 | / | 2935 | / v 2936 -|- >[ ] <--- new preheader. 2937 | | 2938 | v 2939 | [ ] \ 2940 | [ ]_| <-- old scalar loop to handle remainder. 2941 \ | 2942 \ v 2943 >[ ] <-- exit block. 2944 ... 2945 */ 2946 2947 MDNode *OrigLoopID = OrigLoop->getLoopID(); 2948 2949 // Some loops have a single integer induction variable, while other loops 2950 // don't. One example is c++ iterators that often have multiple pointer 2951 // induction variables. In the code below we also support a case where we 2952 // don't have a single induction variable. 2953 // 2954 // We try to obtain an induction variable from the original loop as hard 2955 // as possible. However if we don't find one that: 2956 // - is an integer 2957 // - counts from zero, stepping by one 2958 // - is the size of the widest induction variable type 2959 // then we create a new one. 2960 OldInduction = Legal->getPrimaryInduction(); 2961 Type *IdxTy = Legal->getWidestInductionType(); 2962 2963 // Split the single block loop into the two loop structure described above. 2964 LoopScalarBody = OrigLoop->getHeader(); 2965 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 2966 LoopExitBlock = OrigLoop->getExitBlock(); 2967 assert(LoopExitBlock && "Must have an exit block"); 2968 assert(LoopVectorPreHeader && "Invalid loop structure"); 2969 2970 LoopMiddleBlock = 2971 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2972 LI, nullptr, "middle.block"); 2973 LoopScalarPreHeader = 2974 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 2975 nullptr, "scalar.ph"); 2976 // We intentionally don't let SplitBlock to update LoopInfo since 2977 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 2978 // LoopVectorBody is explicitly added to the correct place few lines later. 2979 LoopVectorBody = 2980 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2981 nullptr, nullptr, "vector.body"); 2982 2983 // Update dominator for loop exit. 2984 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 2985 2986 // Create and register the new vector loop. 
2987 Loop *Lp = LI->AllocateLoop(); 2988 Loop *ParentLoop = OrigLoop->getParentLoop(); 2989 2990 // Insert the new loop into the loop nest and register the new basic blocks 2991 // before calling any utilities such as SCEV that require valid LoopInfo. 2992 if (ParentLoop) { 2993 ParentLoop->addChildLoop(Lp); 2994 } else { 2995 LI->addTopLevelLoop(Lp); 2996 } 2997 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 2998 2999 // Find the loop boundaries. 3000 Value *Count = getOrCreateTripCount(Lp); 3001 3002 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3003 3004 // Now, compare the new count to zero. If it is zero skip the vector loop and 3005 // jump to the scalar loop. This check also covers the case where the 3006 // backedge-taken count is uint##_max: adding one to it will overflow leading 3007 // to an incorrect trip count of zero. In this (rare) case we will also jump 3008 // to the scalar loop. 3009 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3010 3011 // Generate the code to check any assumptions that we've made for SCEV 3012 // expressions. 3013 emitSCEVChecks(Lp, LoopScalarPreHeader); 3014 3015 // Generate the code that checks in runtime if arrays overlap. We put the 3016 // checks into a separate block to make the more common case of few elements 3017 // faster. 3018 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3019 3020 // Generate the induction variable. 3021 // The loop step is equal to the vectorization factor (num of SIMD elements) 3022 // times the unroll factor (num of SIMD instructions). 3023 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3024 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 3025 Induction = 3026 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3027 getDebugLocFromInstOrOperands(OldInduction)); 3028 3029 // We are going to resume the execution of the scalar loop. 3030 // Go over all of the induction variables that we found and fix the 3031 // PHIs that are left in the scalar version of the loop. 3032 // The starting values of PHI nodes depend on the counter of the last 3033 // iteration in the vectorized loop. 3034 // If we come from a bypass edge then we need to start from the original 3035 // start value. 3036 3037 // This variable saves the new starting index for the scalar loop. It is used 3038 // to test if there are any tail iterations left once the vector loop has 3039 // completed. 3040 for (auto &InductionEntry : Legal->getInductionVars()) { 3041 PHINode *OrigPhi = InductionEntry.first; 3042 InductionDescriptor II = InductionEntry.second; 3043 3044 // Create phi nodes to merge from the backedge-taken check block. 3045 PHINode *BCResumeVal = 3046 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3047 LoopScalarPreHeader->getTerminator()); 3048 // Copy original phi DL over to the new one. 3049 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3050 Value *&EndValue = IVEndValues[OrigPhi]; 3051 if (OrigPhi == OldInduction) { 3052 // We know what the end value is. 
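      // For the primary induction (an integer IV counting from zero by one in
      // the widest induction type), the resume value is simply the vector
      // trip count.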
3053 EndValue = CountRoundDown; 3054 } else { 3055 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); 3056 Type *StepType = II.getStep()->getType(); 3057 Instruction::CastOps CastOp = 3058 CastInst::getCastOpcode(CountRoundDown, true, StepType, true); 3059 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); 3060 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3061 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3062 EndValue->setName("ind.end"); 3063 } 3064 3065 // The new PHI merges the original incoming value, in case of a bypass, 3066 // or the value at the end of the vectorized loop. 3067 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3068 3069 // Fix the scalar body counter (PHI node). 3070 // The old induction's phi node in the scalar body needs the truncated 3071 // value. 3072 for (BasicBlock *BB : LoopBypassBlocks) 3073 BCResumeVal->addIncoming(II.getStartValue(), BB); 3074 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3075 } 3076 3077 // We need the OrigLoop (scalar loop part) latch terminator to help 3078 // produce correct debug info for the middle block BB instructions. 3079 // The legality check stage guarantees that the loop will have a single 3080 // latch. 3081 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3082 "Scalar loop latch terminator isn't a branch"); 3083 BranchInst *ScalarLatchBr = 3084 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3085 3086 // Add a check in the middle block to see if we have completed 3087 // all of the iterations in the first vector loop. 3088 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3089 // If tail is to be folded, we know we don't need to run the remainder. 3090 Value *CmpN = Builder.getTrue(); 3091 if (!Cost->foldTailByMasking()) { 3092 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3093 CountRoundDown, "cmp.n", 3094 LoopMiddleBlock->getTerminator()); 3095 3096 // Here we use the same DebugLoc as the scalar loop latch branch instead 3097 // of the corresponding compare because they may have ended up with 3098 // different line numbers and we want to avoid awkward line stepping while 3099 // debugging. Eg. if the compare has got a line number inside the loop. 3100 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3101 } 3102 3103 BranchInst *BrInst = 3104 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3105 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3106 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3107 3108 // Get ready to start creating new instructions into the vectorized body. 3109 assert(LoopVectorPreHeader == Lp->getLoopPreheader() && 3110 "Inconsistent vector loop preheader"); 3111 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3112 3113 Optional<MDNode *> VectorizedLoopID = 3114 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3115 LLVMLoopVectorizeFollowupVectorized}); 3116 if (VectorizedLoopID.hasValue()) { 3117 Lp->setLoopID(VectorizedLoopID.getValue()); 3118 3119 // Do not setAlreadyVectorized if loop attributes have been defined 3120 // explicitly. 3121 return LoopVectorPreHeader; 3122 } 3123 3124 // Keep all loop hints from the original loop on the vector loop (we'll 3125 // replace the vectorizer-specific hints below). 
  if (MDNode *LID = OrigLoop->getLoopID())
    Lp->setLoopID(LID);

  LoopVectorizeHints Hints(Lp, true, *ORE);
  Hints.setAlreadyVectorized();

#ifdef EXPENSIVE_CHECKS
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
  LI->verify(*DT);
#endif

  return LoopVectorPreHeader;
}

// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the
  // penultimate value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent
  // SCEVs, that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());
      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");
      Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
3200 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3201 PHI->addIncoming(I.second, MiddleBlock); 3202 } 3203 } 3204 3205 namespace { 3206 3207 struct CSEDenseMapInfo { 3208 static bool canHandle(const Instruction *I) { 3209 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3210 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3211 } 3212 3213 static inline Instruction *getEmptyKey() { 3214 return DenseMapInfo<Instruction *>::getEmptyKey(); 3215 } 3216 3217 static inline Instruction *getTombstoneKey() { 3218 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3219 } 3220 3221 static unsigned getHashValue(const Instruction *I) { 3222 assert(canHandle(I) && "Unknown instruction!"); 3223 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3224 I->value_op_end())); 3225 } 3226 3227 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3228 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3229 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3230 return LHS == RHS; 3231 return LHS->isIdenticalTo(RHS); 3232 } 3233 }; 3234 3235 } // end anonymous namespace 3236 3237 ///Perform cse of induction variable instructions. 3238 static void cse(BasicBlock *BB) { 3239 // Perform simple cse. 3240 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3241 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3242 Instruction *In = &*I++; 3243 3244 if (!CSEDenseMapInfo::canHandle(In)) 3245 continue; 3246 3247 // Check if we can replace this instruction with any of the 3248 // visited instructions. 3249 if (Instruction *V = CSEMap.lookup(In)) { 3250 In->replaceAllUsesWith(V); 3251 In->eraseFromParent(); 3252 continue; 3253 } 3254 3255 CSEMap[In] = In; 3256 } 3257 } 3258 3259 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3260 unsigned VF, 3261 bool &NeedToScalarize) { 3262 Function *F = CI->getCalledFunction(); 3263 Type *ScalarRetTy = CI->getType(); 3264 SmallVector<Type *, 4> Tys, ScalarTys; 3265 for (auto &ArgOp : CI->arg_operands()) 3266 ScalarTys.push_back(ArgOp->getType()); 3267 3268 // Estimate cost of scalarized vector call. The source operands are assumed 3269 // to be vectors, so we need to extract individual elements from there, 3270 // execute VF scalar calls, and then gather the result into the vector return 3271 // value. 3272 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); 3273 if (VF == 1) 3274 return ScalarCallCost; 3275 3276 // Compute corresponding vector type for return value and arguments. 3277 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3278 for (Type *ScalarTy : ScalarTys) 3279 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3280 3281 // Compute costs of unpacking argument values for the scalar calls and 3282 // packing the return values to a vector. 3283 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3284 3285 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3286 3287 // If we can't emit a vector call for this function, then the currently found 3288 // cost is the cost we need to return. 3289 NeedToScalarize = true; 3290 VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/); 3291 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3292 3293 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3294 return Cost; 3295 3296 // If the corresponding vector cost is cheaper, return its cost. 
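  // At this point a vector variant of the callee is known to exist (and the
  // call is not 'nobuiltin'), so compare its cost against the scalarization
  // cost computed above.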
3297 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); 3298 if (VectorCallCost < Cost) { 3299 NeedToScalarize = false; 3300 return VectorCallCost; 3301 } 3302 return Cost; 3303 } 3304 3305 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3306 unsigned VF) { 3307 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3308 assert(ID && "Expected intrinsic call!"); 3309 3310 FastMathFlags FMF; 3311 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3312 FMF = FPMO->getFastMathFlags(); 3313 3314 SmallVector<Value *, 4> Operands(CI->arg_operands()); 3315 return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF, CI); 3316 } 3317 3318 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3319 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3320 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3321 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3322 } 3323 3324 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3325 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3326 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3327 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3328 } 3329 3330 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3331 // For every instruction `I` in MinBWs, truncate the operands, create a 3332 // truncated version of `I` and reextend its result. InstCombine runs 3333 // later and will remove any ext/trunc pairs. 3334 SmallPtrSet<Value *, 4> Erased; 3335 for (const auto &KV : Cost->getMinimalBitwidths()) { 3336 // If the value wasn't vectorized, we must maintain the original scalar 3337 // type. The absence of the value from VectorLoopValueMap indicates that it 3338 // wasn't vectorized. 3339 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3340 continue; 3341 for (unsigned Part = 0; Part < UF; ++Part) { 3342 Value *I = getOrCreateVectorValue(KV.first, Part); 3343 if (Erased.find(I) != Erased.end() || I->use_empty() || 3344 !isa<Instruction>(I)) 3345 continue; 3346 Type *OriginalTy = I->getType(); 3347 Type *ScalarTruncatedTy = 3348 IntegerType::get(OriginalTy->getContext(), KV.second); 3349 Type *TruncatedTy = VectorType::get( 3350 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements()); 3351 if (TruncatedTy == OriginalTy) 3352 continue; 3353 3354 IRBuilder<> B(cast<Instruction>(I)); 3355 auto ShrinkOperand = [&](Value *V) -> Value * { 3356 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3357 if (ZI->getSrcTy() == TruncatedTy) 3358 return ZI->getOperand(0); 3359 return B.CreateZExtOrTrunc(V, TruncatedTy); 3360 }; 3361 3362 // The actual instruction modification depends on the instruction type, 3363 // unfortunately. 3364 Value *NewI = nullptr; 3365 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3366 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3367 ShrinkOperand(BO->getOperand(1))); 3368 3369 // Any wrapping introduced by shrinking this operation shouldn't be 3370 // considered undefined behavior. So, we can't unconditionally copy 3371 // arithmetic wrapping flags to NewI. 
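        // For example, an i32 'add nuw' that never wraps at 32 bits may well
        // wrap once its operands are truncated to the minimal bit width.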
3372 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3373 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3374 NewI = 3375 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3376 ShrinkOperand(CI->getOperand(1))); 3377 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3378 NewI = B.CreateSelect(SI->getCondition(), 3379 ShrinkOperand(SI->getTrueValue()), 3380 ShrinkOperand(SI->getFalseValue())); 3381 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3382 switch (CI->getOpcode()) { 3383 default: 3384 llvm_unreachable("Unhandled cast!"); 3385 case Instruction::Trunc: 3386 NewI = ShrinkOperand(CI->getOperand(0)); 3387 break; 3388 case Instruction::SExt: 3389 NewI = B.CreateSExtOrTrunc( 3390 CI->getOperand(0), 3391 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3392 break; 3393 case Instruction::ZExt: 3394 NewI = B.CreateZExtOrTrunc( 3395 CI->getOperand(0), 3396 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3397 break; 3398 } 3399 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3400 auto Elements0 = 3401 cast<VectorType>(SI->getOperand(0)->getType())->getNumElements(); 3402 auto *O0 = B.CreateZExtOrTrunc( 3403 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3404 auto Elements1 = 3405 cast<VectorType>(SI->getOperand(1)->getType())->getNumElements(); 3406 auto *O1 = B.CreateZExtOrTrunc( 3407 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3408 3409 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3410 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3411 // Don't do anything with the operands, just extend the result. 3412 continue; 3413 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3414 auto Elements = 3415 cast<VectorType>(IE->getOperand(0)->getType())->getNumElements(); 3416 auto *O0 = B.CreateZExtOrTrunc( 3417 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3418 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3419 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3420 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3421 auto Elements = 3422 cast<VectorType>(EE->getOperand(0)->getType())->getNumElements(); 3423 auto *O0 = B.CreateZExtOrTrunc( 3424 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3425 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3426 } else { 3427 // If we don't know what to do, be conservative and don't do anything. 3428 continue; 3429 } 3430 3431 // Lastly, extend the result. 3432 NewI->takeName(cast<Instruction>(I)); 3433 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3434 I->replaceAllUsesWith(Res); 3435 cast<Instruction>(I)->eraseFromParent(); 3436 Erased.insert(I); 3437 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3438 } 3439 } 3440 3441 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3442 for (const auto &KV : Cost->getMinimalBitwidths()) { 3443 // If the value wasn't vectorized, we must maintain the original scalar 3444 // type. The absence of the value from VectorLoopValueMap indicates that it 3445 // wasn't vectorized. 
    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = getOrCreateVectorValue(KV.first, Part);
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      if (Inst && Inst->use_empty()) {
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
      }
    }
  }
}

void InnerLoopVectorizer::fixVectorizedLoop() {
  // Insert truncates and extends for any truncated instructions as hints to
  // InstCombine.
  if (VF > 1)
    truncateToMinimalBitwidths();

  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (OrigPHIsToFix.size()) {
    assert(EnableVPlanNativePath &&
           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
    fixNonInductionPHIs();
  }

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences.
  fixCrossIterationPHIs();

  // Forget the original basic block.
  PSE.getSE()->forgetLoop(OrigLoop);

  // Fix up external users of the induction variables.
  for (auto &Entry : Legal->getInductionVars())
    fixupIVUsers(Entry.first, Entry.second,
                 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
                 IVEndValues[Entry.first], LoopMiddleBlock);

  fixLCSSAPHIs();
  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(LoopVectorBody);

  // Set/update profile weights for the vector and remainder loops as the
  // original loop iterations are now distributed among them. Note that the
  // original loop, represented by LoopScalarBody, becomes the remainder loop
  // after vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less precise result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
  setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
                               LI->getLoopFor(LoopVectorBody),
                               LI->getLoopFor(LoopScalarBody), VF * UF);
}

void InnerLoopVectorizer::fixCrossIterationPHIs() {
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #2: We now need to fix the recurrences by adding incoming edges to
  // the currently empty PHI nodes. At this point every instruction in the
  // original loop is widened to a vector form, so we can use the widened
  // values to construct the incoming edges.
  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
    // Handle first-order recurrences and reductions that need to be fixed.
    if (Legal->isFirstOrderRecurrence(&Phi))
      fixFirstOrderRecurrence(&Phi);
    else if (Legal->isReductionVariable(&Phi))
      fixReduction(&Phi);
  }
}

void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
  // This is the second phase of vectorizing first-order recurrences.
An 3527 // overview of the transformation is described below. Suppose we have the 3528 // following loop. 3529 // 3530 // for (int i = 0; i < n; ++i) 3531 // b[i] = a[i] - a[i - 1]; 3532 // 3533 // There is a first-order recurrence on "a". For this loop, the shorthand 3534 // scalar IR looks like: 3535 // 3536 // scalar.ph: 3537 // s_init = a[-1] 3538 // br scalar.body 3539 // 3540 // scalar.body: 3541 // i = phi [0, scalar.ph], [i+1, scalar.body] 3542 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3543 // s2 = a[i] 3544 // b[i] = s2 - s1 3545 // br cond, scalar.body, ... 3546 // 3547 // In this example, s1 is a recurrence because it's value depends on the 3548 // previous iteration. In the first phase of vectorization, we created a 3549 // temporary value for s1. We now complete the vectorization and produce the 3550 // shorthand vector IR shown below (for VF = 4, UF = 1). 3551 // 3552 // vector.ph: 3553 // v_init = vector(..., ..., ..., a[-1]) 3554 // br vector.body 3555 // 3556 // vector.body 3557 // i = phi [0, vector.ph], [i+4, vector.body] 3558 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3559 // v2 = a[i, i+1, i+2, i+3]; 3560 // v3 = vector(v1(3), v2(0, 1, 2)) 3561 // b[i, i+1, i+2, i+3] = v2 - v3 3562 // br cond, vector.body, middle.block 3563 // 3564 // middle.block: 3565 // x = v2(3) 3566 // br scalar.ph 3567 // 3568 // scalar.ph: 3569 // s_init = phi [x, middle.block], [a[-1], otherwise] 3570 // br scalar.body 3571 // 3572 // After execution completes the vector loop, we extract the next value of 3573 // the recurrence (x) to use as the initial value in the scalar loop. 3574 3575 // Get the original loop preheader and single loop latch. 3576 auto *Preheader = OrigLoop->getLoopPreheader(); 3577 auto *Latch = OrigLoop->getLoopLatch(); 3578 3579 // Get the initial and previous values of the scalar recurrence. 3580 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 3581 auto *Previous = Phi->getIncomingValueForBlock(Latch); 3582 3583 // Create a vector from the initial value. 3584 auto *VectorInit = ScalarInit; 3585 if (VF > 1) { 3586 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3587 VectorInit = Builder.CreateInsertElement( 3588 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3589 Builder.getInt32(VF - 1), "vector.recur.init"); 3590 } 3591 3592 // We constructed a temporary phi node in the first phase of vectorization. 3593 // This phi node will eventually be deleted. 3594 Builder.SetInsertPoint( 3595 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3596 3597 // Create a phi node for the new recurrence. The current value will either be 3598 // the initial value inserted into a vector or loop-varying vector value. 3599 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3600 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3601 3602 // Get the vectorized previous value of the last part UF - 1. It appears last 3603 // among all unrolled iterations, due to the order of their construction. 3604 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3605 3606 // Find and set the insertion point after the previous value if it is an 3607 // instruction. 3608 BasicBlock::iterator InsertPt; 3609 // Note that the previous value may have been constant-folded so it is not 3610 // guaranteed to be an instruction in the vector loop. 3611 // FIXME: Loop invariant values do not form recurrences. We should deal with 3612 // them earlier. 
3613 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3614 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3615 else { 3616 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3617 if (isa<PHINode>(PreviousLastPart)) 3618 // If the previous value is a phi node, we should insert after all the phi 3619 // nodes in the block containing the PHI to avoid breaking basic block 3620 // verification. Note that the basic block may be different to 3621 // LoopVectorBody, in case we predicate the loop. 3622 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3623 else 3624 InsertPt = ++PreviousInst->getIterator(); 3625 } 3626 Builder.SetInsertPoint(&*InsertPt); 3627 3628 // We will construct a vector for the recurrence by combining the values for 3629 // the current and previous iterations. This is the required shuffle mask. 3630 SmallVector<int, 8> ShuffleMask(VF); 3631 ShuffleMask[0] = VF - 1; 3632 for (unsigned I = 1; I < VF; ++I) 3633 ShuffleMask[I] = I + VF - 1; 3634 3635 // The vector from which to take the initial value for the current iteration 3636 // (actual or unrolled). Initially, this is the vector phi node. 3637 Value *Incoming = VecPhi; 3638 3639 // Shuffle the current and previous vector and update the vector parts. 3640 for (unsigned Part = 0; Part < UF; ++Part) { 3641 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3642 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3643 auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, 3644 ShuffleMask) 3645 : Incoming; 3646 PhiPart->replaceAllUsesWith(Shuffle); 3647 cast<Instruction>(PhiPart)->eraseFromParent(); 3648 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3649 Incoming = PreviousPart; 3650 } 3651 3652 // Fix the latch value of the new recurrence in the vector loop. 3653 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3654 3655 // Extract the last vector element in the middle block. This will be the 3656 // initial value for the recurrence when jumping to the scalar loop. 3657 auto *ExtractForScalar = Incoming; 3658 if (VF > 1) { 3659 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3660 ExtractForScalar = Builder.CreateExtractElement( 3661 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); 3662 } 3663 // Extract the second last element in the middle block if the 3664 // Phi is used outside the loop. We need to extract the phi itself 3665 // and not the last element (the phi update in the current iteration). This 3666 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3667 // when the scalar loop is not run at all. 3668 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3669 if (VF > 1) 3670 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3671 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); 3672 // When loop is unrolled without vectorizing, initialize 3673 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 3674 // `Incoming`. This is analogous to the vectorized case above: extracting the 3675 // second last element when VF > 1. 3676 else if (UF > 1) 3677 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 3678 3679 // Fix the initial value of the original recurrence in the scalar loop. 
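  // The scalar preheader can be reached either from the middle block (the
  // vector loop executed) or from one of the bypass blocks (it did not), so
  // the start value is a phi of the extracted element and the original scalar
  // initial value.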
  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }

  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
  Phi->setName("scalar.recur");

  // Finally, fix users of the recurrence outside the loop. The users will need
  // either the last value of the scalar recurrence or the last value of the
  // vector recurrence we extracted in the middle block. Since the loop is in
  // LCSSA form, we just need to find all the phi nodes for the original scalar
  // recurrence in the exit block, and then add an edge for the middle block.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getIncomingValue(0) == Phi) {
      LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
    }
  }
}

void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
  Constant *Zero = Builder.getInt32(0);

  // Get its reduction variable descriptor.
  assert(Legal->isReductionVariable(Phi) &&
         "Unable to find the reduction variable");
  RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];

  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
      RdxDesc.getMinMaxRecurrenceKind();
  setDebugLocFromInst(Builder, ReductionStartValue);

  // We need to generate a reduction vector from the incoming scalar.
  // To do so, we need to generate the 'identity' vector and override
  // one of the elements with the incoming scalar reduction. We need
  // to do it in the vector-loop preheader.
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();

  // Find the reduction identity value: zero for addition, or and xor; one for
  // multiplication; all-ones (-1) for and.
  Value *Identity;
  Value *VectorStart;
  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
      RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
    if (VF == 1) {
      VectorStart = Identity = ReductionStartValue;
    } else {
      VectorStart = Identity =
          Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
    }
  } else {
    // Handle other reduction kinds:
    Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
        RK, VecTy->getScalarType());
    if (VF == 1) {
      Identity = Iden;
      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart = ReductionStartValue;
    } else {
      Identity = ConstantVector::getSplat({VF, false}, Iden);

      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart =
          Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
    }
  }

  // Wrap flags are in general invalid after vectorization, clear them.
  clearReductionWrapFlags(RdxDesc);

  // Fix the vector-loop phi.
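  // Each unrolled part gets VectorStart (for part 0) or the identity value on
  // its preheader edge, and the corresponding vectorized loop value on its
  // latch edge.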
3762 3763 // Reductions do not have to start at zero. They can start with 3764 // any loop invariant values. 3765 BasicBlock *Latch = OrigLoop->getLoopLatch(); 3766 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 3767 3768 for (unsigned Part = 0; Part < UF; ++Part) { 3769 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 3770 Value *Val = getOrCreateVectorValue(LoopVal, Part); 3771 // Make sure to add the reduction start value only to the 3772 // first unroll part. 3773 Value *StartVal = (Part == 0) ? VectorStart : Identity; 3774 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 3775 cast<PHINode>(VecRdxPhi) 3776 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3777 } 3778 3779 // Before each round, move the insertion point right between 3780 // the PHIs and the values we are going to write. 3781 // This allows us to write both PHINodes and the extractelement 3782 // instructions. 3783 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3784 3785 setDebugLocFromInst(Builder, LoopExitInst); 3786 3787 // If tail is folded by masking, the vector value to leave the loop should be 3788 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3789 // instead of the former. 3790 if (Cost->foldTailByMasking()) { 3791 for (unsigned Part = 0; Part < UF; ++Part) { 3792 Value *VecLoopExitInst = 3793 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3794 Value *Sel = nullptr; 3795 for (User *U : VecLoopExitInst->users()) { 3796 if (isa<SelectInst>(U)) { 3797 assert(!Sel && "Reduction exit feeding two selects"); 3798 Sel = U; 3799 } else 3800 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3801 } 3802 assert(Sel && "Reduction exit feeds no select"); 3803 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 3804 } 3805 } 3806 3807 // If the vector reduction can be performed in a smaller type, we truncate 3808 // then extend the loop exit value to enable InstCombine to evaluate the 3809 // entire expression in the smaller type. 3810 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { 3811 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3812 Builder.SetInsertPoint( 3813 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 3814 VectorParts RdxParts(UF); 3815 for (unsigned Part = 0; Part < UF; ++Part) { 3816 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3817 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3818 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3819 : Builder.CreateZExt(Trunc, VecTy); 3820 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 3821 UI != RdxParts[Part]->user_end();) 3822 if (*UI != Trunc) { 3823 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 3824 RdxParts[Part] = Extnd; 3825 } else { 3826 ++UI; 3827 } 3828 } 3829 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3830 for (unsigned Part = 0; Part < UF; ++Part) { 3831 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3832 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 3833 } 3834 } 3835 3836 // Reduce all of the unrolled parts into a single vector. 3837 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 3838 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3839 3840 // The middle block terminator has already been assigned a DebugLoc here (the 3841 // OrigLoop's single latch terminator). 
We want the whole middle block to 3842 // appear to execute on this line because: (a) it is all compiler generated, 3843 // (b) these instructions are always executed after evaluating the latch 3844 // conditional branch, and (c) other passes may add new predecessors which 3845 // terminate on this line. This is the easiest way to ensure we don't 3846 // accidentally cause an extra step back into the loop while debugging. 3847 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 3848 for (unsigned Part = 1; Part < UF; ++Part) { 3849 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3850 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3851 // Floating point operations had to be 'fast' to enable the reduction. 3852 ReducedPartRdx = addFastMathFlag( 3853 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 3854 ReducedPartRdx, "bin.rdx"), 3855 RdxDesc.getFastMathFlags()); 3856 else 3857 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 3858 RdxPart); 3859 } 3860 3861 if (VF > 1) { 3862 bool NoNaN = Legal->hasFunNoNaNAttr(); 3863 ReducedPartRdx = 3864 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 3865 // If the reduction can be performed in a smaller type, we need to extend 3866 // the reduction to the wider type before we branch to the original loop. 3867 if (Phi->getType() != RdxDesc.getRecurrenceType()) 3868 ReducedPartRdx = 3869 RdxDesc.isSigned() 3870 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 3871 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 3872 } 3873 3874 // Create a phi node that merges control-flow from the backedge-taken check 3875 // block and the middle block. 3876 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 3877 LoopScalarPreHeader->getTerminator()); 3878 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3879 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3880 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3881 3882 // Now, we need to fix the users of the reduction variable 3883 // inside and outside of the scalar remainder loop. 3884 // We know that the loop is in LCSSA form. We need to update the 3885 // PHI nodes in the exit blocks. 3886 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3887 // All PHINodes need to have a single entry edge, or two if 3888 // we already fixed them. 3889 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 3890 3891 // We found a reduction value exit-PHI. Update it with the 3892 // incoming bypass edge. 3893 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 3894 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 3895 } // end of the LCSSA phi scan. 3896 3897 // Fix the scalar loop reduction variable with the incoming reduction sum 3898 // from the vector body and from the backedge value. 3899 int IncomingEdgeBlockIdx = 3900 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 3901 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 3902 // Pick the other block. 3903 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 3904 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3905 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3906 } 3907 3908 void InnerLoopVectorizer::clearReductionWrapFlags( 3909 RecurrenceDescriptor &RdxDesc) { 3910 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3911 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 3912 RK != RecurrenceDescriptor::RK_IntegerMult) 3913 return; 3914 3915 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 3916 assert(LoopExitInstr && "null loop exit instruction"); 3917 SmallVector<Instruction *, 8> Worklist; 3918 SmallPtrSet<Instruction *, 8> Visited; 3919 Worklist.push_back(LoopExitInstr); 3920 Visited.insert(LoopExitInstr); 3921 3922 while (!Worklist.empty()) { 3923 Instruction *Cur = Worklist.pop_back_val(); 3924 if (isa<OverflowingBinaryOperator>(Cur)) 3925 for (unsigned Part = 0; Part < UF; ++Part) { 3926 Value *V = getOrCreateVectorValue(Cur, Part); 3927 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 3928 } 3929 3930 for (User *U : Cur->users()) { 3931 Instruction *UI = cast<Instruction>(U); 3932 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 3933 Visited.insert(UI).second) 3934 Worklist.push_back(UI); 3935 } 3936 } 3937 } 3938 3939 void InnerLoopVectorizer::fixLCSSAPHIs() { 3940 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3941 if (LCSSAPhi.getNumIncomingValues() == 1) { 3942 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 3943 // Non-instruction incoming values will have only one value. 3944 unsigned LastLane = 0; 3945 if (isa<Instruction>(IncomingValue)) 3946 LastLane = Cost->isUniformAfterVectorization( 3947 cast<Instruction>(IncomingValue), VF) 3948 ? 0 3949 : VF - 1; 3950 // Can be a loop invariant incoming value or the last scalar value to be 3951 // extracted from the vectorized loop. 3952 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3953 Value *lastIncomingValue = 3954 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 3955 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 3956 } 3957 } 3958 } 3959 3960 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 3961 // The basic block and loop containing the predicated instruction. 3962 auto *PredBB = PredInst->getParent(); 3963 auto *VectorLoop = LI->getLoopFor(PredBB); 3964 3965 // Initialize a worklist with the operands of the predicated instruction. 3966 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 3967 3968 // Holds instructions that we need to analyze again. An instruction may be 3969 // reanalyzed if we don't yet know if we can sink it or not. 3970 SmallVector<Instruction *, 8> InstsToReanalyze; 3971 3972 // Returns true if a given use occurs in the predicated block. Phi nodes use 3973 // their operands in their corresponding predecessor blocks. 3974 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 3975 auto *I = cast<Instruction>(U.getUser()); 3976 BasicBlock *BB = I->getParent(); 3977 if (auto *Phi = dyn_cast<PHINode>(I)) 3978 BB = Phi->getIncomingBlock( 3979 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 3980 return BB == PredBB; 3981 }; 3982 3983 // Iteratively sink the scalarized operands of the predicated instruction 3984 // into the block we created for it. When an instruction is sunk, it's 3985 // operands are then added to the worklist. The algorithm ends after one pass 3986 // through the worklist doesn't sink a single instruction. 
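  // In other words, this is a simple fixed-point iteration: sinking one
  // instruction may make its operands sinkable, so we keep iterating until a
  // full pass over the worklist makes no change.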
3987 bool Changed; 3988 do { 3989 // Add the instructions that need to be reanalyzed to the worklist, and 3990 // reset the changed indicator. 3991 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 3992 InstsToReanalyze.clear(); 3993 Changed = false; 3994 3995 while (!Worklist.empty()) { 3996 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 3997 3998 // We can't sink an instruction if it is a phi node, is already in the 3999 // predicated block, is not in the loop, or may have side effects. 4000 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4001 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4002 continue; 4003 4004 // It's legal to sink the instruction if all its uses occur in the 4005 // predicated block. Otherwise, there's nothing to do yet, and we may 4006 // need to reanalyze the instruction. 4007 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4008 InstsToReanalyze.push_back(I); 4009 continue; 4010 } 4011 4012 // Move the instruction to the beginning of the predicated block, and add 4013 // it's operands to the worklist. 4014 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4015 Worklist.insert(I->op_begin(), I->op_end()); 4016 4017 // The sinking may have enabled other instructions to be sunk, so we will 4018 // need to iterate. 4019 Changed = true; 4020 } 4021 } while (Changed); 4022 } 4023 4024 void InnerLoopVectorizer::fixNonInductionPHIs() { 4025 for (PHINode *OrigPhi : OrigPHIsToFix) { 4026 PHINode *NewPhi = 4027 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4028 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4029 4030 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4031 predecessors(OrigPhi->getParent())); 4032 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4033 predecessors(NewPhi->getParent())); 4034 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4035 "Scalar and Vector BB should have the same number of predecessors"); 4036 4037 // The insertion point in Builder may be invalidated by the time we get 4038 // here. Force the Builder insertion point to something valid so that we do 4039 // not run into issues during insertion point restore in 4040 // getOrCreateVectorValue calls below. 4041 Builder.SetInsertPoint(NewPhi); 4042 4043 // The predecessor order is preserved and we can rely on mapping between 4044 // scalar and vector block predecessors. 4045 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4046 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4047 4048 // When looking up the new scalar/vector values to fix up, use incoming 4049 // values from original phi. 4050 Value *ScIncV = 4051 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4052 4053 // Scalar incoming value may need a broadcast 4054 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4055 NewPhi->addIncoming(NewIncV, NewPredBB); 4056 } 4057 } 4058 } 4059 4060 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, 4061 unsigned VF, bool IsPtrLoopInvariant, 4062 SmallBitVector &IsIndexLoopInvariant) { 4063 // Construct a vector GEP by widening the operands of the scalar GEP as 4064 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4065 // results in a vector of pointers when at least one operand of the GEP 4066 // is vector-typed. Thus, to keep the representation compact, we only use 4067 // vector-typed operands for loop-varying values. 
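  // For example, a GEP with a loop-invariant base pointer and a single
  // loop-varying index keeps the scalar base and widens only that index,
  // still producing the required vector of pointers.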
4068 4069 if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4070 // If we are vectorizing, but the GEP has only loop-invariant operands, 4071 // the GEP we build (by only using vector-typed operands for 4072 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4073 // produce a vector of pointers, we need to either arbitrarily pick an 4074 // operand to broadcast, or broadcast a clone of the original GEP. 4075 // Here, we broadcast a clone of the original. 4076 // 4077 // TODO: If at some point we decide to scalarize instructions having 4078 // loop-invariant operands, this special case will no longer be 4079 // required. We would add the scalarization decision to 4080 // collectLoopScalars() and teach getVectorValue() to broadcast 4081 // the lane-zero scalar value. 4082 auto *Clone = Builder.Insert(GEP->clone()); 4083 for (unsigned Part = 0; Part < UF; ++Part) { 4084 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4085 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4086 addMetadata(EntryPart, GEP); 4087 } 4088 } else { 4089 // If the GEP has at least one loop-varying operand, we are sure to 4090 // produce a vector of pointers. But if we are only unrolling, we want 4091 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4092 // produce with the code below will be scalar (if VF == 1) or vector 4093 // (otherwise). Note that for the unroll-only case, we still maintain 4094 // values in the vector mapping with initVector, as we do for other 4095 // instructions. 4096 for (unsigned Part = 0; Part < UF; ++Part) { 4097 // The pointer operand of the new GEP. If it's loop-invariant, we 4098 // won't broadcast it. 4099 auto *Ptr = IsPtrLoopInvariant 4100 ? GEP->getPointerOperand() 4101 : getOrCreateVectorValue(GEP->getPointerOperand(), Part); 4102 4103 // Collect all the indices for the new GEP. If any index is 4104 // loop-invariant, we won't broadcast it. 4105 SmallVector<Value *, 4> Indices; 4106 for (auto Index : enumerate(GEP->indices())) { 4107 Value *User = Index.value().get(); 4108 if (IsIndexLoopInvariant[Index.index()]) 4109 Indices.push_back(User); 4110 else 4111 Indices.push_back(getOrCreateVectorValue(User, Part)); 4112 } 4113 4114 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4115 // but it should be a vector, otherwise. 4116 auto *NewGEP = 4117 GEP->isInBounds() 4118 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4119 Indices) 4120 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4121 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4122 "NewGEP is not a pointer vector"); 4123 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4124 addMetadata(NewGEP, GEP); 4125 } 4126 } 4127 } 4128 4129 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4130 unsigned VF) { 4131 PHINode *P = cast<PHINode>(PN); 4132 if (EnableVPlanNativePath) { 4133 // Currently we enter here in the VPlan-native path for non-induction 4134 // PHIs where all control flow is uniform. We simply widen these PHIs. 4135 // Create a vector phi with no operands - the vector phi operands will be 4136 // set at the end of vector code generation. 4137 Type *VecTy = 4138 (VF == 1) ? 
PN->getType() : VectorType::get(PN->getType(), VF); 4139 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4140 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4141 OrigPHIsToFix.push_back(P); 4142 4143 return; 4144 } 4145 4146 assert(PN->getParent() == OrigLoop->getHeader() && 4147 "Non-header phis should have been handled elsewhere"); 4148 4149 // In order to support recurrences we need to be able to vectorize Phi nodes. 4150 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4151 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4152 // this value when we vectorize all of the instructions that use the PHI. 4153 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4154 for (unsigned Part = 0; Part < UF; ++Part) { 4155 // This is phase one of vectorizing PHIs. 4156 Type *VecTy = 4157 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4158 Value *EntryPart = PHINode::Create( 4159 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4160 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4161 } 4162 return; 4163 } 4164 4165 setDebugLocFromInst(Builder, P); 4166 4167 // This PHINode must be an induction variable. 4168 // Make sure that we know about it. 4169 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4170 4171 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4172 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4173 4174 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4175 // which can be found from the original scalar operations. 4176 switch (II.getKind()) { 4177 case InductionDescriptor::IK_NoInduction: 4178 llvm_unreachable("Unknown induction"); 4179 case InductionDescriptor::IK_IntInduction: 4180 case InductionDescriptor::IK_FpInduction: 4181 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4182 case InductionDescriptor::IK_PtrInduction: { 4183 // Handle the pointer induction variable case. 4184 assert(P->getType()->isPointerTy() && "Unexpected type."); 4185 // This is the normalized GEP that starts counting at zero. 4186 Value *PtrInd = Induction; 4187 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); 4188 // Determine the number of scalars we need to generate for each unroll 4189 // iteration. If the instruction is uniform, we only need to generate the 4190 // first lane. Otherwise, we generate all VF values. 4191 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; 4192 // These are the scalar results. Notice that we don't generate vector GEPs 4193 // because scalar GEPs result in better code. 4194 for (unsigned Part = 0; Part < UF; ++Part) { 4195 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4196 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); 4197 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4198 Value *SclrGep = 4199 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4200 SclrGep->setName("next.gep"); 4201 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4202 } 4203 } 4204 return; 4205 } 4206 } 4207 } 4208 4209 /// A helper function for checking whether an integer division-related 4210 /// instruction may divide by zero (in which case it must be predicated if 4211 /// executed conditionally in the scalar code). 4212 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 
4213 /// Non-zero divisors that are non compile-time constants will not be 4214 /// converted into multiplication, so we will still end up scalarizing 4215 /// the division, but can do so w/o predication. 4216 static bool mayDivideByZero(Instruction &I) { 4217 assert((I.getOpcode() == Instruction::UDiv || 4218 I.getOpcode() == Instruction::SDiv || 4219 I.getOpcode() == Instruction::URem || 4220 I.getOpcode() == Instruction::SRem) && 4221 "Unexpected instruction"); 4222 Value *Divisor = I.getOperand(1); 4223 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4224 return !CInt || CInt->isZero(); 4225 } 4226 4227 void InnerLoopVectorizer::widenInstruction(Instruction &I) { 4228 switch (I.getOpcode()) { 4229 case Instruction::Call: 4230 case Instruction::Br: 4231 case Instruction::PHI: 4232 case Instruction::GetElementPtr: 4233 case Instruction::Select: 4234 llvm_unreachable("This instruction is handled by a different recipe."); 4235 case Instruction::UDiv: 4236 case Instruction::SDiv: 4237 case Instruction::SRem: 4238 case Instruction::URem: 4239 case Instruction::Add: 4240 case Instruction::FAdd: 4241 case Instruction::Sub: 4242 case Instruction::FSub: 4243 case Instruction::FNeg: 4244 case Instruction::Mul: 4245 case Instruction::FMul: 4246 case Instruction::FDiv: 4247 case Instruction::FRem: 4248 case Instruction::Shl: 4249 case Instruction::LShr: 4250 case Instruction::AShr: 4251 case Instruction::And: 4252 case Instruction::Or: 4253 case Instruction::Xor: { 4254 // Just widen unops and binops. 4255 setDebugLocFromInst(Builder, &I); 4256 4257 for (unsigned Part = 0; Part < UF; ++Part) { 4258 SmallVector<Value *, 2> Ops; 4259 for (Value *Op : I.operands()) 4260 Ops.push_back(getOrCreateVectorValue(Op, Part)); 4261 4262 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4263 4264 if (auto *VecOp = dyn_cast<Instruction>(V)) 4265 VecOp->copyIRFlags(&I); 4266 4267 // Use this vector value for all users of the original instruction. 4268 VectorLoopValueMap.setVectorValue(&I, Part, V); 4269 addMetadata(V, &I); 4270 } 4271 4272 break; 4273 } 4274 case Instruction::ICmp: 4275 case Instruction::FCmp: { 4276 // Widen compares. Generate vector compares. 4277 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4278 auto *Cmp = cast<CmpInst>(&I); 4279 setDebugLocFromInst(Builder, Cmp); 4280 for (unsigned Part = 0; Part < UF; ++Part) { 4281 Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); 4282 Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part); 4283 Value *C = nullptr; 4284 if (FCmp) { 4285 // Propagate fast math flags. 4286 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4287 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4288 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4289 } else { 4290 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4291 } 4292 VectorLoopValueMap.setVectorValue(&I, Part, C); 4293 addMetadata(C, &I); 4294 } 4295 4296 break; 4297 } 4298 4299 case Instruction::ZExt: 4300 case Instruction::SExt: 4301 case Instruction::FPToUI: 4302 case Instruction::FPToSI: 4303 case Instruction::FPExt: 4304 case Instruction::PtrToInt: 4305 case Instruction::IntToPtr: 4306 case Instruction::SIToFP: 4307 case Instruction::UIToFP: 4308 case Instruction::Trunc: 4309 case Instruction::FPTrunc: 4310 case Instruction::BitCast: { 4311 auto *CI = cast<CastInst>(&I); 4312 setDebugLocFromInst(Builder, CI); 4313 4314 /// Vectorize casts. 4315 Type *DestTy = 4316 (VF == 1) ? 
CI->getType() : VectorType::get(CI->getType(), VF); 4317 4318 for (unsigned Part = 0; Part < UF; ++Part) { 4319 Value *A = getOrCreateVectorValue(CI->getOperand(0), Part); 4320 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4321 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4322 addMetadata(Cast, &I); 4323 } 4324 break; 4325 } 4326 default: 4327 // This instruction is not vectorized by simple widening. 4328 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4329 llvm_unreachable("Unhandled instruction!"); 4330 } // end of switch. 4331 } 4332 4333 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, 4334 VPTransformState &State) { 4335 assert(!isa<DbgInfoIntrinsic>(I) && 4336 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4337 setDebugLocFromInst(Builder, &I); 4338 4339 Module *M = I.getParent()->getParent()->getParent(); 4340 auto *CI = cast<CallInst>(&I); 4341 4342 SmallVector<Type *, 4> Tys; 4343 for (Value *ArgOperand : CI->arg_operands()) 4344 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4345 4346 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4347 4348 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4349 // version of the instruction. 4350 // Is it beneficial to perform intrinsic call compared to lib call? 4351 bool NeedToScalarize = false; 4352 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4353 bool UseVectorIntrinsic = 4354 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4355 assert((UseVectorIntrinsic || !NeedToScalarize) && 4356 "Instruction should be scalarized elsewhere."); 4357 4358 for (unsigned Part = 0; Part < UF; ++Part) { 4359 SmallVector<Value *, 4> Args; 4360 for (auto &I : enumerate(ArgOperands.operands())) { 4361 // Some intrinsics have a scalar argument - don't replace it with a 4362 // vector. 4363 Value *Arg; 4364 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4365 Arg = State.get(I.value(), Part); 4366 else 4367 Arg = State.get(I.value(), {0, 0}); 4368 Args.push_back(Arg); 4369 } 4370 4371 Function *VectorF; 4372 if (UseVectorIntrinsic) { 4373 // Use vector version of the intrinsic. 4374 Type *TysForDecl[] = {CI->getType()}; 4375 if (VF > 1) 4376 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4377 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4378 } else { 4379 // Use vector version of the function call. 
4380 const VFShape Shape = 4381 VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/); 4382 #ifndef NDEBUG 4383 const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI); 4384 assert(std::find_if(Infos.begin(), Infos.end(), 4385 [&Shape](const VFInfo &Info) { 4386 return Info.Shape == Shape; 4387 }) != Infos.end() && 4388 "Vector function shape is missing from the database."); 4389 #endif 4390 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4391 } 4392 assert(VectorF && "Can't create vector function."); 4393 4394 SmallVector<OperandBundleDef, 1> OpBundles; 4395 CI->getOperandBundlesAsDefs(OpBundles); 4396 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4397 4398 if (isa<FPMathOperator>(V)) 4399 V->copyFastMathFlags(CI); 4400 4401 VectorLoopValueMap.setVectorValue(&I, Part, V); 4402 addMetadata(V, &I); 4403 } 4404 } 4405 4406 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, 4407 bool InvariantCond) { 4408 setDebugLocFromInst(Builder, &I); 4409 4410 // The condition can be loop invariant but still defined inside the 4411 // loop. This means that we can't just use the original 'cond' value. 4412 // We have to take the 'vectorized' value and pick the first lane. 4413 // Instcombine will make this a no-op. 4414 4415 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); 4416 4417 for (unsigned Part = 0; Part < UF; ++Part) { 4418 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); 4419 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); 4420 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); 4421 Value *Sel = 4422 Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1); 4423 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4424 addMetadata(Sel, &I); 4425 } 4426 } 4427 4428 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4429 // We should not collect Scalars more than once per VF. Right now, this 4430 // function is called from collectUniformsAndScalars(), which already does 4431 // this check. Collecting Scalars for VF=1 does not make any sense. 4432 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && 4433 "This function should not be visited twice for the same VF"); 4434 4435 SmallSetVector<Instruction *, 8> Worklist; 4436 4437 // These sets are used to seed the analysis with pointers used by memory 4438 // accesses that will remain scalar. 4439 SmallSetVector<Instruction *, 8> ScalarPtrs; 4440 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4441 4442 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4443 // The pointer operands of loads and stores will be scalar as long as the 4444 // memory access is not a gather or scatter operation. The value operand of a 4445 // store will remain scalar if the store is scalarized. 4446 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4447 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4448 assert(WideningDecision != CM_Unknown && 4449 "Widening decision should be ready at this moment"); 4450 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4451 if (Ptr == Store->getValueOperand()) 4452 return WideningDecision == CM_Scalarize; 4453 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4454 "Ptr is neither a value or pointer operand"); 4455 return WideningDecision != CM_GatherScatter; 4456 }; 4457 4458 // A helper that returns true if the given value is a bitcast or 4459 // getelementptr instruction contained in the loop. 
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use
  // will be a scalar use, and the pointer is only used by memory accesses, we
  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with three classes of instructions: (1)
  // instructions marked uniform-after-vectorization, (2) bitcast and
  // getelementptr instructions used by memory accesses requiring a scalar use,
  // and (3) pointer induction variables and their update instructions (we
  // currently only scalarize these).
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  for (auto *I : ScalarPtrs)
    if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // (3) Add to the worklist all pointer induction variables and their update
  // instructions.
  //
  // TODO: Once we are able to vectorize pointer induction variables we should
  // no longer insert them into the worklist here.
4528 auto *Latch = TheLoop->getLoopLatch(); 4529 for (auto &Induction : Legal->getInductionVars()) { 4530 auto *Ind = Induction.first; 4531 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4532 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) 4533 continue; 4534 Worklist.insert(Ind); 4535 Worklist.insert(IndUpdate); 4536 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4537 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4538 << "\n"); 4539 } 4540 4541 // Insert the forced scalars. 4542 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4543 // induction variable when the PHI user is scalarized. 4544 auto ForcedScalar = ForcedScalars.find(VF); 4545 if (ForcedScalar != ForcedScalars.end()) 4546 for (auto *I : ForcedScalar->second) 4547 Worklist.insert(I); 4548 4549 // Expand the worklist by looking through any bitcasts and getelementptr 4550 // instructions we've already identified as scalar. This is similar to the 4551 // expansion step in collectLoopUniforms(); however, here we're only 4552 // expanding to include additional bitcasts and getelementptr instructions. 4553 unsigned Idx = 0; 4554 while (Idx != Worklist.size()) { 4555 Instruction *Dst = Worklist[Idx++]; 4556 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4557 continue; 4558 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4559 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4560 auto *J = cast<Instruction>(U); 4561 return !TheLoop->contains(J) || Worklist.count(J) || 4562 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4563 isScalarUse(J, Src)); 4564 })) { 4565 Worklist.insert(Src); 4566 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4567 } 4568 } 4569 4570 // An induction variable will remain scalar if all users of the induction 4571 // variable and induction variable update remain scalar. 4572 for (auto &Induction : Legal->getInductionVars()) { 4573 auto *Ind = Induction.first; 4574 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4575 4576 // We already considered pointer induction variables, so there's no reason 4577 // to look at their users again. 4578 // 4579 // TODO: Once we are able to vectorize pointer induction variables we 4580 // should no longer skip over them here. 4581 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) 4582 continue; 4583 4584 // Determine if all users of the induction variable are scalar after 4585 // vectorization. 4586 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4587 auto *I = cast<Instruction>(U); 4588 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4589 }); 4590 if (!ScalarInd) 4591 continue; 4592 4593 // Determine if all users of the induction variable update instruction are 4594 // scalar after vectorization. 4595 auto ScalarIndUpdate = 4596 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4597 auto *I = cast<Instruction>(U); 4598 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4599 }); 4600 if (!ScalarIndUpdate) 4601 continue; 4602 4603 // The induction variable and its update instruction will remain scalar. 
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  Scalars[VF].insert(Worklist.begin(), Worklist.end());
}

bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
  if (!blockNeedsPredication(I->getParent()))
    return false;
  switch(I->getOpcode()) {
  default:
    break;
  case Instruction::Load:
  case Instruction::Store: {
    if (!Legal->isMaskRequired(I))
      return false;
    auto *Ptr = getLoadStorePointerOperand(I);
    auto *Ty = getMemInstValueType(I);
    // We have already decided how to vectorize this instruction, get that
    // result.
    if (VF > 1) {
      InstWidening WideningDecision = getWideningDecision(I, VF);
      assert(WideningDecision != CM_Unknown &&
             "Widening decision should be ready at this moment");
      return WideningDecision == CM_Scalarize;
    }
    const MaybeAlign Alignment = getLoadStoreAlignment(I);
    return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
                                isLegalMaskedGather(Ty, Alignment))
                            : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
                                isLegalMaskedScatter(Ty, Alignment));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    return mayDivideByZero(*I);
  }
  return false;
}

bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
                                                               unsigned VF) {
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
  assert(getWideningDecision(I, VF) == CM_Unknown &&
         "Decision should not be set yet.");
  auto *Group = getInterleavedAccessGroup(I);
  assert(Group && "Must have a group.");

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = getMemInstValueType(I);
  if (hasIrregularType(ScalarTy, DL, VF))
    return false;

  // Check if masking is required.
  // A Group may need masking for one of two reasons: it resides in a block that
  // needs predication, or it was decided to use masking to deal with gaps.
  bool PredicatedAccessRequiresMasking =
      Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
  bool AccessWithGapsRequiresMasking =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
  assert(useMaskedInterleavedAccesses(TTI) &&
         "Masked interleave-groups for predicated accesses are not enabled.");

  auto *Ty = getMemInstValueType(I);
  const MaybeAlign Alignment = getLoadStoreAlignment(I);
  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
                          : TTI.isLegalMaskedStore(Ty, Alignment);
}

bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
                                                               unsigned VF) {
  // Get and ensure we have a valid memory instruction.
  LoadInst *LI = dyn_cast<LoadInst>(I);
  StoreInst *SI = dyn_cast<StoreInst>(I);
  assert((LI || SI) && "Invalid memory instruction");

  auto *Ptr = getLoadStorePointerOperand(I);

  // In order to be widened, the pointer should be consecutive, first of all.
  if (!Legal->isConsecutivePtr(Ptr))
    return false;

  // If the instruction is a store located in a predicated block, it will be
  // scalarized.
  if (isScalarWithPredication(I))
    return false;

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  if (hasIrregularType(ScalarTy, DL, VF))
    return false;

  return true;
}

void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
         "This function should not be visited twice for the same VF");

  // Initialize Uniforms for this VF. Even if we do not find any uniform
  // values, we will not analyze it again: Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Instructions that are scalar with predication must not be considered
  // uniform after vectorization, because that would create an erroneous
  // replicating region where only a single instance out of VF should be formed.
  // TODO: optimize such seldom cases if found important, see PR40816.
  auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
    if (isScalarWithPredication(I, VF)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
                        << *I << "\n");
      return;
    }
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
    Worklist.insert(I);
  };

  // Start with the conditional branch. If the branch condition is an
  // instruction contained in the loop that is only used by the branch, it is
  // uniform.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
    addToWorklistIfAllowed(Cmp);

  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
  // are pointers that are treated like consecutive pointers during
  // vectorization. The pointer operands of interleaved accesses are an
  // example.
  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;

  // Holds pointer operands of instructions that are possibly non-uniform.
4769 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 4770 4771 auto isUniformDecision = [&](Instruction *I, unsigned VF) { 4772 InstWidening WideningDecision = getWideningDecision(I, VF); 4773 assert(WideningDecision != CM_Unknown && 4774 "Widening decision should be ready at this moment"); 4775 4776 return (WideningDecision == CM_Widen || 4777 WideningDecision == CM_Widen_Reverse || 4778 WideningDecision == CM_Interleave); 4779 }; 4780 // Iterate over the instructions in the loop, and collect all 4781 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 4782 // that a consecutive-like pointer operand will be scalarized, we collect it 4783 // in PossibleNonUniformPtrs instead. We use two sets here because a single 4784 // getelementptr instruction can be used by both vectorized and scalarized 4785 // memory instructions. For example, if a loop loads and stores from the same 4786 // location, but the store is conditional, the store will be scalarized, and 4787 // the getelementptr won't remain uniform. 4788 for (auto *BB : TheLoop->blocks()) 4789 for (auto &I : *BB) { 4790 // If there's no pointer operand, there's nothing to do. 4791 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 4792 if (!Ptr) 4793 continue; 4794 4795 // True if all users of Ptr are memory accesses that have Ptr as their 4796 // pointer operand. 4797 auto UsersAreMemAccesses = 4798 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 4799 return getLoadStorePointerOperand(U) == Ptr; 4800 }); 4801 4802 // Ensure the memory instruction will not be scalarized or used by 4803 // gather/scatter, making its pointer operand non-uniform. If the pointer 4804 // operand is used by any instruction other than a memory access, we 4805 // conservatively assume the pointer operand may be non-uniform. 4806 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 4807 PossibleNonUniformPtrs.insert(Ptr); 4808 4809 // If the memory instruction will be vectorized and its pointer operand 4810 // is consecutive-like, or interleaving - the pointer operand should 4811 // remain uniform. 4812 else 4813 ConsecutiveLikePtrs.insert(Ptr); 4814 } 4815 4816 // Add to the Worklist all consecutive and consecutive-like pointers that 4817 // aren't also identified as possibly non-uniform. 4818 for (auto *V : ConsecutiveLikePtrs) 4819 if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) 4820 addToWorklistIfAllowed(V); 4821 4822 // Expand Worklist in topological order: whenever a new instruction 4823 // is added , its users should be already inside Worklist. It ensures 4824 // a uniform instruction will only be used by uniform instructions. 4825 unsigned idx = 0; 4826 while (idx != Worklist.size()) { 4827 Instruction *I = Worklist[idx++]; 4828 4829 for (auto OV : I->operand_values()) { 4830 // isOutOfScope operands cannot be uniform instructions. 4831 if (isOutOfScope(OV)) 4832 continue; 4833 // First order recurrence Phi's should typically be considered 4834 // non-uniform. 4835 auto *OP = dyn_cast<PHINode>(OV); 4836 if (OP && Legal->isFirstOrderRecurrence(OP)) 4837 continue; 4838 // If all the users of the operand are uniform, then add the 4839 // operand into the uniform worklist. 
4840 auto *OI = cast<Instruction>(OV); 4841 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4842 auto *J = cast<Instruction>(U); 4843 return Worklist.count(J) || 4844 (OI == getLoadStorePointerOperand(J) && 4845 isUniformDecision(J, VF)); 4846 })) 4847 addToWorklistIfAllowed(OI); 4848 } 4849 } 4850 4851 // Returns true if Ptr is the pointer operand of a memory access instruction 4852 // I, and I is known to not require scalarization. 4853 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4854 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4855 }; 4856 4857 // For an instruction to be added into Worklist above, all its users inside 4858 // the loop should also be in Worklist. However, this condition cannot be 4859 // true for phi nodes that form a cyclic dependence. We must process phi 4860 // nodes separately. An induction variable will remain uniform if all users 4861 // of the induction variable and induction variable update remain uniform. 4862 // The code below handles both pointer and non-pointer induction variables. 4863 for (auto &Induction : Legal->getInductionVars()) { 4864 auto *Ind = Induction.first; 4865 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4866 4867 // Determine if all users of the induction variable are uniform after 4868 // vectorization. 4869 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4870 auto *I = cast<Instruction>(U); 4871 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4872 isVectorizedMemAccessUse(I, Ind); 4873 }); 4874 if (!UniformInd) 4875 continue; 4876 4877 // Determine if all users of the induction variable update instruction are 4878 // uniform after vectorization. 4879 auto UniformIndUpdate = 4880 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4881 auto *I = cast<Instruction>(U); 4882 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4883 isVectorizedMemAccessUse(I, IndUpdate); 4884 }); 4885 if (!UniformIndUpdate) 4886 continue; 4887 4888 // The induction variable and its update instruction will remain uniform. 4889 addToWorklistIfAllowed(Ind); 4890 addToWorklistIfAllowed(IndUpdate); 4891 } 4892 4893 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4894 } 4895 4896 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4897 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4898 4899 if (Legal->getRuntimePointerChecking()->Need) { 4900 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4901 "runtime pointer checks needed. Enable vectorization of this " 4902 "loop with '#pragma clang loop vectorize(enable)' when " 4903 "compiling with -Os/-Oz", 4904 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4905 return true; 4906 } 4907 4908 if (!PSE.getUnionPredicate().getPredicates().empty()) { 4909 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4910 "runtime SCEV checks needed. Enable vectorization of this " 4911 "loop with '#pragma clang loop vectorize(enable)' when " 4912 "compiling with -Os/-Oz", 4913 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4914 return true; 4915 } 4916 4917 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4918 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4919 reportVectorizationFailure("Runtime stride check is required with -Os/-Oz", 4920 "runtime stride == 1 checks needed. 
Enable vectorization of " 4921 "this loop with '#pragma clang loop vectorize(enable)' when " 4922 "compiling with -Os/-Oz", 4923 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4924 return true; 4925 } 4926 4927 return false; 4928 } 4929 4930 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { 4931 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4932 // TODO: It may by useful to do since it's still likely to be dynamically 4933 // uniform if the target can skip. 4934 reportVectorizationFailure( 4935 "Not inserting runtime ptr check for divergent target", 4936 "runtime pointer checks needed. Not enabled for divergent target", 4937 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 4938 return None; 4939 } 4940 4941 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4942 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4943 if (TC == 1) { 4944 reportVectorizationFailure("Single iteration (non) loop", 4945 "loop trip count is one, irrelevant for vectorization", 4946 "SingleIterationLoop", ORE, TheLoop); 4947 return None; 4948 } 4949 4950 switch (ScalarEpilogueStatus) { 4951 case CM_ScalarEpilogueAllowed: 4952 return computeFeasibleMaxVF(TC); 4953 case CM_ScalarEpilogueNotNeededUsePredicate: 4954 LLVM_DEBUG( 4955 dbgs() << "LV: vector predicate hint/switch found.\n" 4956 << "LV: Not allowing scalar epilogue, creating predicated " 4957 << "vector loop.\n"); 4958 break; 4959 case CM_ScalarEpilogueNotAllowedLowTripLoop: 4960 // fallthrough as a special case of OptForSize 4961 case CM_ScalarEpilogueNotAllowedOptSize: 4962 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 4963 LLVM_DEBUG( 4964 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 4965 else 4966 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4967 << "count.\n"); 4968 4969 // Bail if runtime checks are required, which are not good when optimising 4970 // for size. 4971 if (runtimeChecksRequired()) 4972 return None; 4973 break; 4974 } 4975 4976 // Now try the tail folding 4977 4978 // Invalidate interleave groups that require an epilogue if we can't mask 4979 // the interleave-group. 4980 if (!useMaskedInterleavedAccesses(TTI)) 4981 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 4982 4983 unsigned MaxVF = computeFeasibleMaxVF(TC); 4984 if (TC > 0 && TC % MaxVF == 0) { 4985 // Accept MaxVF if we do not have a tail. 4986 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 4987 return MaxVF; 4988 } 4989 4990 // If we don't know the precise trip count, or if the trip count that we 4991 // found modulo the vectorization factor is not zero, try to fold the tail 4992 // by masking. 4993 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 4994 if (Legal->prepareToFoldTailByMasking()) { 4995 FoldTailByMasking = true; 4996 return MaxVF; 4997 } 4998 4999 if (TC == 0) { 5000 reportVectorizationFailure( 5001 "Unable to calculate the loop count due to complex control flow", 5002 "unable to calculate the loop count due to complex control flow", 5003 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5004 return None; 5005 } 5006 5007 reportVectorizationFailure( 5008 "Cannot optimize for size and vectorize at the same time.", 5009 "cannot optimize for size and vectorize at the same time. 
" 5010 "Enable vectorization of this loop with '#pragma clang loop " 5011 "vectorize(enable)' when compiling with -Os/-Oz", 5012 "NoTailLoopWithOptForSize", ORE, TheLoop); 5013 return None; 5014 } 5015 5016 unsigned 5017 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 5018 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5019 unsigned SmallestType, WidestType; 5020 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5021 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5022 5023 // Get the maximum safe dependence distance in bits computed by LAA. 5024 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5025 // the memory accesses that is most restrictive (involved in the smallest 5026 // dependence distance). 5027 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 5028 5029 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 5030 5031 unsigned MaxVectorSize = WidestRegister / WidestType; 5032 5033 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5034 << " / " << WidestType << " bits.\n"); 5035 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5036 << WidestRegister << " bits.\n"); 5037 5038 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 5039 " into one vector!"); 5040 if (MaxVectorSize == 0) { 5041 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5042 MaxVectorSize = 1; 5043 return MaxVectorSize; 5044 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5045 isPowerOf2_32(ConstTripCount)) { 5046 // We need to clamp the VF to be the ConstTripCount. There is no point in 5047 // choosing a higher viable VF as done in the loop below. 5048 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5049 << ConstTripCount << "\n"); 5050 MaxVectorSize = ConstTripCount; 5051 return MaxVectorSize; 5052 } 5053 5054 unsigned MaxVF = MaxVectorSize; 5055 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5056 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5057 // Collect all viable vectorization factors larger than the default MaxVF 5058 // (i.e. MaxVectorSize). 5059 SmallVector<unsigned, 8> VFs; 5060 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5061 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5062 VFs.push_back(VS); 5063 5064 // For each VF calculate its register usage. 5065 auto RUs = calculateRegisterUsage(VFs); 5066 5067 // Select the largest VF which doesn't require more registers than existing 5068 // ones. 
5069 for (int i = RUs.size() - 1; i >= 0; --i) { 5070 bool Selected = true; 5071 for (auto& pair : RUs[i].MaxLocalUsers) { 5072 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5073 if (pair.second > TargetNumRegisters) 5074 Selected = false; 5075 } 5076 if (Selected) { 5077 MaxVF = VFs[i]; 5078 break; 5079 } 5080 } 5081 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5082 if (MaxVF < MinVF) { 5083 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5084 << ") with target's minimum: " << MinVF << '\n'); 5085 MaxVF = MinVF; 5086 } 5087 } 5088 } 5089 return MaxVF; 5090 } 5091 5092 VectorizationFactor 5093 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5094 float Cost = expectedCost(1).first; 5095 const float ScalarCost = Cost; 5096 unsigned Width = 1; 5097 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5098 5099 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5100 if (ForceVectorization && MaxVF > 1) { 5101 // Ignore scalar width, because the user explicitly wants vectorization. 5102 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5103 // evaluation. 5104 Cost = std::numeric_limits<float>::max(); 5105 } 5106 5107 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5108 // Notice that the vector loop needs to be executed less times, so 5109 // we need to divide the cost of the vector loops by the width of 5110 // the vector elements. 5111 VectorizationCostTy C = expectedCost(i); 5112 float VectorCost = C.first / (float)i; 5113 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5114 << " costs: " << (int)VectorCost << ".\n"); 5115 if (!C.second && !ForceVectorization) { 5116 LLVM_DEBUG( 5117 dbgs() << "LV: Not considering vector loop of width " << i 5118 << " because it will not generate any vector instructions.\n"); 5119 continue; 5120 } 5121 if (VectorCost < Cost) { 5122 Cost = VectorCost; 5123 Width = i; 5124 } 5125 } 5126 5127 if (!EnableCondStoresVectorization && NumPredStores) { 5128 reportVectorizationFailure("There are conditional stores.", 5129 "store that is conditionally executed prevents vectorization", 5130 "ConditionalStore", ORE, TheLoop); 5131 Width = 1; 5132 Cost = ScalarCost; 5133 } 5134 5135 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5136 << "LV: Vectorization seems to be not beneficial, " 5137 << "but was forced by a user.\n"); 5138 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5139 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; 5140 return Factor; 5141 } 5142 5143 std::pair<unsigned, unsigned> 5144 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5145 unsigned MinWidth = -1U; 5146 unsigned MaxWidth = 8; 5147 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5148 5149 // For each block. 5150 for (BasicBlock *BB : TheLoop->blocks()) { 5151 // For each instruction in the loop. 5152 for (Instruction &I : BB->instructionsWithoutDebug()) { 5153 Type *T = I.getType(); 5154 5155 // Skip ignored values. 5156 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end()) 5157 continue; 5158 5159 // Only examine Loads, Stores and PHINodes. 5160 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5161 continue; 5162 5163 // Examine PHI nodes that are reduction variables. Update the type to 5164 // account for the recurrence type. 
5165 if (auto *PN = dyn_cast<PHINode>(&I)) { 5166 if (!Legal->isReductionVariable(PN)) 5167 continue; 5168 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5169 T = RdxDesc.getRecurrenceType(); 5170 } 5171 5172 // Examine the stored values. 5173 if (auto *ST = dyn_cast<StoreInst>(&I)) 5174 T = ST->getValueOperand()->getType(); 5175 5176 // Ignore loaded pointer types and stored pointer types that are not 5177 // vectorizable. 5178 // 5179 // FIXME: The check here attempts to predict whether a load or store will 5180 // be vectorized. We only know this for certain after a VF has 5181 // been selected. Here, we assume that if an access can be 5182 // vectorized, it will be. We should also look at extending this 5183 // optimization to non-pointer types. 5184 // 5185 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5186 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5187 continue; 5188 5189 MinWidth = std::min(MinWidth, 5190 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5191 MaxWidth = std::max(MaxWidth, 5192 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5193 } 5194 } 5195 5196 return {MinWidth, MaxWidth}; 5197 } 5198 5199 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, 5200 unsigned LoopCost) { 5201 // -- The interleave heuristics -- 5202 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5203 // There are many micro-architectural considerations that we can't predict 5204 // at this level. For example, frontend pressure (on decode or fetch) due to 5205 // code size, or the number and capabilities of the execution ports. 5206 // 5207 // We use the following heuristics to select the interleave count: 5208 // 1. If the code has reductions, then we interleave to break the cross 5209 // iteration dependency. 5210 // 2. If the loop is really small, then we interleave to reduce the loop 5211 // overhead. 5212 // 3. We don't interleave if we think that we will spill registers to memory 5213 // due to the increased register pressure. 5214 5215 if (!isScalarEpilogueAllowed()) 5216 return 1; 5217 5218 // We used the distance for the interleave count. 5219 if (Legal->getMaxSafeDepDistBytes() != -1U) 5220 return 1; 5221 5222 // Do not interleave loops with a relatively small known or estimated trip 5223 // count. 5224 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5225 if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) 5226 return 1; 5227 5228 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5229 // We divide by these constants so assume that we have at least one 5230 // instruction that uses at least one register. 5231 for (auto& pair : R.MaxLocalUsers) { 5232 pair.second = std::max(pair.second, 1U); 5233 } 5234 5235 // We calculate the interleave count using the following formula. 5236 // Subtract the number of loop invariants from the number of available 5237 // registers. These registers are used by all of the interleaved instances. 5238 // Next, divide the remaining registers by the number of registers that is 5239 // required by the loop, in order to estimate how many parallel instances 5240 // fit without causing spills. All of this is rounded down if necessary to be 5241 // a power of two. We want power of two interleave count to simplify any 5242 // addressing operations or alignment considerations. 
5243 // We also want power of two interleave counts to ensure that the induction 5244 // variable of the vector loop wraps to zero, when tail is folded by masking; 5245 // this currently happens when OptForSize, in which case IC is set to 1 above. 5246 unsigned IC = UINT_MAX; 5247 5248 for (auto& pair : R.MaxLocalUsers) { 5249 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5250 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5251 << " registers of " 5252 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5253 if (VF == 1) { 5254 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5255 TargetNumRegisters = ForceTargetNumScalarRegs; 5256 } else { 5257 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5258 TargetNumRegisters = ForceTargetNumVectorRegs; 5259 } 5260 unsigned MaxLocalUsers = pair.second; 5261 unsigned LoopInvariantRegs = 0; 5262 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5263 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5264 5265 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5266 // Don't count the induction variable as interleaved. 5267 if (EnableIndVarRegisterHeur) { 5268 TmpIC = 5269 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5270 std::max(1U, (MaxLocalUsers - 1))); 5271 } 5272 5273 IC = std::min(IC, TmpIC); 5274 } 5275 5276 // Clamp the interleave ranges to reasonable counts. 5277 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5278 5279 // Check if the user has overridden the max. 5280 if (VF == 1) { 5281 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5282 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5283 } else { 5284 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5285 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5286 } 5287 5288 // If trip count is known or estimated compile time constant, limit the 5289 // interleave count to be less than the trip count divided by VF. 5290 if (BestKnownTC) { 5291 MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); 5292 } 5293 5294 // If we did not calculate the cost for VF (because the user selected the VF) 5295 // then we calculate the cost of VF here. 5296 if (LoopCost == 0) 5297 LoopCost = expectedCost(VF).first; 5298 5299 assert(LoopCost && "Non-zero loop cost expected"); 5300 5301 // Clamp the calculated IC to be between the 1 and the max interleave count 5302 // that the target and trip count allows. 5303 if (IC > MaxInterleaveCount) 5304 IC = MaxInterleaveCount; 5305 else if (IC < 1) 5306 IC = 1; 5307 5308 // Interleave if we vectorized this loop and there is a reduction that could 5309 // benefit from interleaving. 5310 if (VF > 1 && !Legal->getReductionVars().empty()) { 5311 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5312 return IC; 5313 } 5314 5315 // Note that if we've already vectorized the loop we will have done the 5316 // runtime check and so interleaving won't require further checks. 5317 bool InterleavingRequiresRuntimePointerCheck = 5318 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5319 5320 // We want to interleave small loops in order to reduce the loop overhead and 5321 // potentially expose ILP opportunities. 
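  // For illustration (hypothetical numbers): with SmallLoopCost = 20 and a
  // computed loop cost of 4, the small-loop heuristic below yields
  // SmallIC = min(IC, PowerOf2Floor(20 / 4)) = min(IC, 4), before the
  // store/load-port and nested-reduction clamps are applied.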
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC =
        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

    // Interleave until store/load ports (estimated by max interleave count)
    // are saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit this, by default, to 2,
    // so the critical path only gets increased by one reduction operation.
    if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  bool HasReductions = !Legal->getReductionVars().empty();
  if (TTI.enableAggressiveInterleaving(HasReductions)) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is a
  // very rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are met before
  // their users. We assume that each instruction that has in-loop users starts
  // an interval. We record every time that an in-loop value is used, so we
  // have a list of the first and last occurrences of each instruction. Next,
  // we transpose this data structure into a multi map that holds the list of
  // intervals that *end* at a specific location. This multi map allows us to
  // perform a linear search. We scan the instructions linearly and record each
  // time that a new interval starts, by placing it in a set. If we find this
  // value in the multi-map then we remove it from the set. The max register
  // usage is the maximum size of the set. We also search for instructions that
  // are defined outside the loop, but are used inside the loop. We need this
  // number separately from the max-interval usage number because, when we
  // unroll, loop-invariant values do not take more registers.
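  //
  // For example (illustrative): a value defined at index 3 whose last in-loop
  // use is at index 7 is added to OpenIntervals when the scan reaches its
  // definition and erased once the scan moves past that last use, so it
  // counts toward the per-class maximum only while it is live.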
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (auto &Interval : EndPoint)
    TransposeEnds[Interval.second].push_back(Interval.first);

  SmallPtrSet<Instruction *, 8> OpenIntervals;

  // Get the size of the widest register.
  unsigned MaxSafeDepDist = -1U;
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
  unsigned WidestRegister =
      std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();

  SmallVector<RegisterUsage, 8> RUs(VFs.size());
  SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());

  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  // A lambda that gets the register usage for the given type and VF.
  auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
    if (Ty->isTokenTy())
      return 0U;
    unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
    return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
  };

  for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
    Instruction *I = IdxToInstr[i];

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (Instruction *ToRemove : List)
      OpenIntervals.erase(ToRemove);

    // Ignore instructions that are never used within the loop.
    if (Ends.find(I) == Ends.end())
      continue;

    // Skip ignored values.
    if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
      continue;

    // For each VF find the maximum usage of registers.
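    // For example (assuming a 256-bit widest register), an open interval of
    // i32 type contributes GetRegUsage(i32, VF) = max(1, VF * 32 / 256)
    // registers to its class: one register up to VF = 8, two at VF = 16.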
5483 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5484 // Count the number of live intervals. 5485 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5486 5487 if (VFs[j] == 1) { 5488 for (auto Inst : OpenIntervals) { 5489 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5490 if (RegUsage.find(ClassID) == RegUsage.end()) 5491 RegUsage[ClassID] = 1; 5492 else 5493 RegUsage[ClassID] += 1; 5494 } 5495 } else { 5496 collectUniformsAndScalars(VFs[j]); 5497 for (auto Inst : OpenIntervals) { 5498 // Skip ignored values for VF > 1. 5499 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) 5500 continue; 5501 if (isScalarAfterVectorization(Inst, VFs[j])) { 5502 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5503 if (RegUsage.find(ClassID) == RegUsage.end()) 5504 RegUsage[ClassID] = 1; 5505 else 5506 RegUsage[ClassID] += 1; 5507 } else { 5508 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5509 if (RegUsage.find(ClassID) == RegUsage.end()) 5510 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5511 else 5512 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5513 } 5514 } 5515 } 5516 5517 for (auto& pair : RegUsage) { 5518 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5519 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5520 else 5521 MaxUsages[j][pair.first] = pair.second; 5522 } 5523 } 5524 5525 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5526 << OpenIntervals.size() << '\n'); 5527 5528 // Add the current instruction to the list of open intervals. 5529 OpenIntervals.insert(I); 5530 } 5531 5532 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5533 SmallMapVector<unsigned, unsigned, 4> Invariant; 5534 5535 for (auto Inst : LoopInvariants) { 5536 unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5537 unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); 5538 if (Invariant.find(ClassID) == Invariant.end()) 5539 Invariant[ClassID] = Usage; 5540 else 5541 Invariant[ClassID] += Usage; 5542 } 5543 5544 LLVM_DEBUG({ 5545 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5546 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5547 << " item\n"; 5548 for (const auto &pair : MaxUsages[i]) { 5549 dbgs() << "LV(REG): RegisterClass: " 5550 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5551 << " registers\n"; 5552 } 5553 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5554 << " item\n"; 5555 for (const auto &pair : Invariant) { 5556 dbgs() << "LV(REG): RegisterClass: " 5557 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5558 << " registers\n"; 5559 } 5560 }); 5561 5562 RU.LoopInvariantRegs = Invariant; 5563 RU.MaxLocalUsers = MaxUsages[i]; 5564 RUs[i] = RU; 5565 } 5566 5567 return RUs; 5568 } 5569 5570 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5571 // TODO: Cost model for emulated masked load/store is completely 5572 // broken. This hack guides the cost model to use an artificially 5573 // high enough value to practically disable vectorization with such 5574 // operations, except where previously deployed legality hack allowed 5575 // using very low cost values. This is to avoid regressions coming simply 5576 // from moving "masked load/store" check from legality to cost model. 5577 // Masked Load/Gather emulation was previously never allowed. 5578 // Limited number of Masked Store/Scatter emulation was allowed. 
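// In effect (editor's summary of the check below): every emulated masked
// load, and every emulated masked store once the number of predicated stores
// exceeds the NumberOfStoresToPredicate threshold, is flagged here; callers
// then substitute an artificially huge scalarization cost (see
// getMemInstScalarizationCost) so that such loops are effectively not
// vectorized.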
5579 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 5580 return isa<LoadInst>(I) || 5581 (isa<StoreInst>(I) && 5582 NumPredStores > NumberOfStoresToPredicate); 5583 } 5584 5585 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { 5586 // If we aren't vectorizing the loop, or if we've already collected the 5587 // instructions to scalarize, there's nothing to do. Collection may already 5588 // have occurred if we have a user-selected VF and are now computing the 5589 // expected cost for interleaving. 5590 if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end()) 5591 return; 5592 5593 // Initialize a mapping for VF in InstsToScalarize. If we find that it's 5594 // not profitable to scalarize any instructions, the presence of VF in the 5595 // map will indicate that we've analyzed it already. 5596 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 5597 5598 // Find all the instructions that are scalar with predication in the loop and 5599 // determine if it would be better to not if-convert the blocks they are in. 5600 // If so, we also record the instructions to scalarize. 5601 for (BasicBlock *BB : TheLoop->blocks()) { 5602 if (!blockNeedsPredication(BB)) 5603 continue; 5604 for (Instruction &I : *BB) 5605 if (isScalarWithPredication(&I)) { 5606 ScalarCostsTy ScalarCosts; 5607 // Do not apply discount logic if hacked cost is needed 5608 // for emulated masked memrefs. 5609 if (!useEmulatedMaskMemRefHack(&I) && 5610 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 5611 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5612 // Remember that BB will remain after vectorization. 5613 PredicatedBBsAfterVectorization.insert(BB); 5614 } 5615 } 5616 } 5617 5618 int LoopVectorizationCostModel::computePredInstDiscount( 5619 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, 5620 unsigned VF) { 5621 assert(!isUniformAfterVectorization(PredInst, VF) && 5622 "Instruction marked uniform-after-vectorization will be predicated"); 5623 5624 // Initialize the discount to zero, meaning that the scalar version and the 5625 // vector version cost the same. 5626 int Discount = 0; 5627 5628 // Holds instructions to analyze. The instructions we visit are mapped in 5629 // ScalarCosts. Those instructions are the ones that would be scalarized if 5630 // we find that the scalar version costs less. 5631 SmallVector<Instruction *, 8> Worklist; 5632 5633 // Returns true if the given instruction can be scalarized. 5634 auto canBeScalarized = [&](Instruction *I) -> bool { 5635 // We only attempt to scalarize instructions forming a single-use chain 5636 // from the original predicated block that would otherwise be vectorized. 5637 // Although not strictly necessary, we give up on instructions we know will 5638 // already be scalar to avoid traversing chains that are unlikely to be 5639 // beneficial. 5640 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5641 isScalarAfterVectorization(I, VF)) 5642 return false; 5643 5644 // If the instruction is scalar with predication, it will be analyzed 5645 // separately. We ignore it within the context of PredInst. 5646 if (isScalarWithPredication(I)) 5647 return false; 5648 5649 // If any of the instruction's operands are uniform after vectorization, 5650 // the instruction cannot be scalarized. This prevents, for example, a 5651 // masked load from being scalarized.
5652 // 5653 // We assume we will only emit a value for lane zero of an instruction 5654 // marked uniform after vectorization, rather than VF identical values. 5655 // Thus, if we scalarize an instruction that uses a uniform, we would 5656 // create uses of values corresponding to the lanes we aren't emitting code 5657 // for. This behavior can be changed by allowing getScalarValue to clone 5658 // the lane zero values for uniforms rather than asserting. 5659 for (Use &U : I->operands()) 5660 if (auto *J = dyn_cast<Instruction>(U.get())) 5661 if (isUniformAfterVectorization(J, VF)) 5662 return false; 5663 5664 // Otherwise, we can scalarize the instruction. 5665 return true; 5666 }; 5667 5668 // Compute the expected cost discount from scalarizing the entire expression 5669 // feeding the predicated instruction. We currently only consider expressions 5670 // that are single-use instruction chains. 5671 Worklist.push_back(PredInst); 5672 while (!Worklist.empty()) { 5673 Instruction *I = Worklist.pop_back_val(); 5674 5675 // If we've already analyzed the instruction, there's nothing to do. 5676 if (ScalarCosts.find(I) != ScalarCosts.end()) 5677 continue; 5678 5679 // Compute the cost of the vector instruction. Note that this cost already 5680 // includes the scalarization overhead of the predicated instruction. 5681 unsigned VectorCost = getInstructionCost(I, VF).first; 5682 5683 // Compute the cost of the scalarized instruction. This cost is the cost of 5684 // the instruction as if it wasn't if-converted and instead remained in the 5685 // predicated block. We will scale this cost by block probability after 5686 // computing the scalarization overhead. 5687 unsigned ScalarCost = VF * getInstructionCost(I, 1).first; 5688 5689 // Compute the scalarization overhead of needed insertelement instructions 5690 // and phi nodes. 5691 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5692 ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), 5693 true, false); 5694 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); 5695 } 5696 5697 // Compute the scalarization overhead of needed extractelement 5698 // instructions. For each of the instruction's operands, if the operand can 5699 // be scalarized, add it to the worklist; otherwise, account for the 5700 // overhead. 5701 for (Use &U : I->operands()) 5702 if (auto *J = dyn_cast<Instruction>(U.get())) { 5703 assert(VectorType::isValidElementType(J->getType()) && 5704 "Instruction has non-scalar type"); 5705 if (canBeScalarized(J)) 5706 Worklist.push_back(J); 5707 else if (needsExtract(J, VF)) 5708 ScalarCost += TTI.getScalarizationOverhead( 5709 ToVectorTy(J->getType(),VF), false, true); 5710 } 5711 5712 // Scale the total scalar cost by block probability. 5713 ScalarCost /= getReciprocalPredBlockProb(); 5714 5715 // Compute the discount. A non-negative discount means the vector version 5716 // of the instruction costs more, and scalarizing would be beneficial. 5717 Discount += VectorCost - ScalarCost; 5718 ScalarCosts[I] = ScalarCost; 5719 } 5720 5721 return Discount; 5722 } 5723 5724 LoopVectorizationCostModel::VectorizationCostTy 5725 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5726 VectorizationCostTy Cost; 5727 5728 // For each block. 5729 for (BasicBlock *BB : TheLoop->blocks()) { 5730 VectorizationCostTy BlockCost; 5731 5732 // For each instruction in the old loop. 5733 for (Instruction &I : BB->instructionsWithoutDebug()) { 5734 // Skip ignored values. 
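// (Editor's note: the ignored sets are the ephemeral values and the
// induction/reduction cast instructions gathered by collectValuesToIgnore();
// they are already accounted for by the patterns they belong to, so costing
// them here again would inflate the estimate.)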
5735 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() || 5736 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end())) 5737 continue; 5738 5739 VectorizationCostTy C = getInstructionCost(&I, VF); 5740 5741 // Check if we should override the cost. 5742 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5743 C.first = ForceTargetInstructionCost; 5744 5745 BlockCost.first += C.first; 5746 BlockCost.second |= C.second; 5747 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5748 << " for VF " << VF << " For instruction: " << I 5749 << '\n'); 5750 } 5751 5752 // If we are vectorizing a predicated block, it will have been 5753 // if-converted. This means that the block's instructions (aside from 5754 // stores and instructions that may divide by zero) will now be 5755 // unconditionally executed. For the scalar case, we may not always execute 5756 // the predicated block. Thus, scale the block's cost by the probability of 5757 // executing it. 5758 if (VF == 1 && blockNeedsPredication(BB)) 5759 BlockCost.first /= getReciprocalPredBlockProb(); 5760 5761 Cost.first += BlockCost.first; 5762 Cost.second |= BlockCost.second; 5763 } 5764 5765 return Cost; 5766 } 5767 5768 /// Gets Address Access SCEV after verifying that the access pattern 5769 /// is loop invariant except the induction variable dependence. 5770 /// 5771 /// This SCEV can be sent to the Target in order to estimate the address 5772 /// calculation cost. 5773 static const SCEV *getAddressAccessSCEV( 5774 Value *Ptr, 5775 LoopVectorizationLegality *Legal, 5776 PredicatedScalarEvolution &PSE, 5777 const Loop *TheLoop) { 5778 5779 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5780 if (!Gep) 5781 return nullptr; 5782 5783 // We are looking for a gep with all loop invariant indices except for one 5784 // which should be an induction variable. 5785 auto SE = PSE.getSE(); 5786 unsigned NumOperands = Gep->getNumOperands(); 5787 for (unsigned i = 1; i < NumOperands; ++i) { 5788 Value *Opd = Gep->getOperand(i); 5789 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5790 !Legal->isInductionVariable(Opd)) 5791 return nullptr; 5792 } 5793 5794 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 5795 return PSE.getSCEV(Ptr); 5796 } 5797 5798 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 5799 return Legal->hasStride(I->getOperand(0)) || 5800 Legal->hasStride(I->getOperand(1)); 5801 } 5802 5803 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5804 unsigned VF) { 5805 assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); 5806 Type *ValTy = getMemInstValueType(I); 5807 auto SE = PSE.getSE(); 5808 5809 unsigned AS = getLoadStoreAddressSpace(I); 5810 Value *Ptr = getLoadStorePointerOperand(I); 5811 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5812 5813 // Figure out whether the access is strided and get the stride value 5814 // if it's known in compile time 5815 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5816 5817 // Get the cost of the scalar memory instruction and address computation. 5818 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5819 5820 // Don't pass *I here, since it is scalar but will actually be part of a 5821 // vectorized loop where the user of it is a vectorized instruction. 
5822 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5823 Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 5824 Alignment, AS); 5825 5826 // Get the overhead of the extractelement and insertelement instructions 5827 // we might create due to scalarization. 5828 Cost += getScalarizationOverhead(I, VF); 5829 5830 // If we have a predicated store, it may not be executed for each vector 5831 // lane. Scale the cost by the probability of executing the predicated 5832 // block. 5833 if (isPredicatedInst(I)) { 5834 Cost /= getReciprocalPredBlockProb(); 5835 5836 if (useEmulatedMaskMemRefHack(I)) 5837 // Artificially setting to a high enough value to practically disable 5838 // vectorization with such operations. 5839 Cost = 3000000; 5840 } 5841 5842 return Cost; 5843 } 5844 5845 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5846 unsigned VF) { 5847 Type *ValTy = getMemInstValueType(I); 5848 Type *VectorTy = ToVectorTy(ValTy, VF); 5849 Value *Ptr = getLoadStorePointerOperand(I); 5850 unsigned AS = getLoadStoreAddressSpace(I); 5851 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 5852 5853 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5854 "Stride should be 1 or -1 for consecutive memory access"); 5855 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5856 unsigned Cost = 0; 5857 if (Legal->isMaskRequired(I)) 5858 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, 5859 Alignment ? Alignment->value() : 0, AS); 5860 else 5861 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); 5862 5863 bool Reverse = ConsecutiveStride < 0; 5864 if (Reverse) 5865 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5866 return Cost; 5867 } 5868 5869 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5870 unsigned VF) { 5871 Type *ValTy = getMemInstValueType(I); 5872 Type *VectorTy = ToVectorTy(ValTy, VF); 5873 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5874 unsigned AS = getLoadStoreAddressSpace(I); 5875 if (isa<LoadInst>(I)) { 5876 return TTI.getAddressComputationCost(ValTy) + 5877 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + 5878 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 5879 } 5880 StoreInst *SI = cast<StoreInst>(I); 5881 5882 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 5883 return TTI.getAddressComputationCost(ValTy) + 5884 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + 5885 (isLoopInvariantStoreValue 5886 ? 0 5887 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 5888 VF - 1)); 5889 } 5890 5891 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 5892 unsigned VF) { 5893 Type *ValTy = getMemInstValueType(I); 5894 Type *VectorTy = ToVectorTy(ValTy, VF); 5895 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5896 Value *Ptr = getLoadStorePointerOperand(I); 5897 5898 return TTI.getAddressComputationCost(VectorTy) + 5899 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, 5900 Legal->isMaskRequired(I), 5901 Alignment ? 
Alignment->value() : 0, I); 5902 } 5903 5904 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 5905 unsigned VF) { 5906 Type *ValTy = getMemInstValueType(I); 5907 Type *VectorTy = ToVectorTy(ValTy, VF); 5908 unsigned AS = getLoadStoreAddressSpace(I); 5909 5910 auto Group = getInterleavedAccessGroup(I); 5911 assert(Group && "Fail to get an interleaved access group."); 5912 5913 unsigned InterleaveFactor = Group->getFactor(); 5914 Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 5915 5916 // Holds the indices of existing members in an interleaved load group. 5917 // An interleaved store group doesn't need this as it doesn't allow gaps. 5918 SmallVector<unsigned, 4> Indices; 5919 if (isa<LoadInst>(I)) { 5920 for (unsigned i = 0; i < InterleaveFactor; i++) 5921 if (Group->getMember(i)) 5922 Indices.push_back(i); 5923 } 5924 5925 // Calculate the cost of the whole interleaved group. 5926 bool UseMaskForGaps = 5927 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5928 unsigned Cost = TTI.getInterleavedMemoryOpCost( 5929 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5930 Group->getAlign().value(), AS, Legal->isMaskRequired(I), UseMaskForGaps); 5931 5932 if (Group->isReverse()) { 5933 // TODO: Add support for reversed masked interleaved access. 5934 assert(!Legal->isMaskRequired(I) && 5935 "Reverse masked interleaved access not supported."); 5936 Cost += Group->getNumMembers() * 5937 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5938 } 5939 return Cost; 5940 } 5941 5942 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 5943 unsigned VF) { 5944 // Calculate scalar cost only. Vectorization cost should be ready at this 5945 // moment. 5946 if (VF == 1) { 5947 Type *ValTy = getMemInstValueType(I); 5948 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5949 unsigned AS = getLoadStoreAddressSpace(I); 5950 5951 return TTI.getAddressComputationCost(ValTy) + 5952 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); 5953 } 5954 return getWideningCost(I, VF); 5955 } 5956 5957 LoopVectorizationCostModel::VectorizationCostTy 5958 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 5959 // If we know that this instruction will remain uniform, check the cost of 5960 // the scalar version. 5961 if (isUniformAfterVectorization(I, VF)) 5962 VF = 1; 5963 5964 if (VF > 1 && isProfitableToScalarize(I, VF)) 5965 return VectorizationCostTy(InstsToScalarize[VF][I], false); 5966 5967 // Forced scalars do not have any scalarization overhead. 
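// (Editor's note: such instructions are simply charged VF copies of their
// scalar cost just below, with no insert/extract overhead added; this matches
// how setCostBasedWideningDecision keeps address computations scalar when the
// target does not prefer vectorized addressing.)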
5968 auto ForcedScalar = ForcedScalars.find(VF); 5969 if (VF > 1 && ForcedScalar != ForcedScalars.end()) { 5970 auto InstSet = ForcedScalar->second; 5971 if (InstSet.find(I) != InstSet.end()) 5972 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); 5973 } 5974 5975 Type *VectorTy; 5976 unsigned C = getInstructionCost(I, VF, VectorTy); 5977 5978 bool TypeNotScalarized = 5979 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; 5980 return VectorizationCostTy(C, TypeNotScalarized); 5981 } 5982 5983 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 5984 unsigned VF) { 5985 5986 if (VF == 1) 5987 return 0; 5988 5989 unsigned Cost = 0; 5990 Type *RetTy = ToVectorTy(I->getType(), VF); 5991 if (!RetTy->isVoidTy() && 5992 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 5993 Cost += TTI.getScalarizationOverhead(RetTy, true, false); 5994 5995 // Some targets keep addresses scalar. 5996 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 5997 return Cost; 5998 5999 // Some targets support efficient element stores. 6000 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6001 return Cost; 6002 6003 // Collect operands to consider. 6004 CallInst *CI = dyn_cast<CallInst>(I); 6005 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6006 6007 // Skip operands that do not require extraction/scalarization and do not incur 6008 // any overhead. 6009 return Cost + TTI.getOperandsScalarizationOverhead( 6010 filterExtractingOperands(Ops, VF), VF); 6011 } 6012 6013 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { 6014 if (VF == 1) 6015 return; 6016 NumPredStores = 0; 6017 for (BasicBlock *BB : TheLoop->blocks()) { 6018 // For each instruction in the old loop. 6019 for (Instruction &I : *BB) { 6020 Value *Ptr = getLoadStorePointerOperand(&I); 6021 if (!Ptr) 6022 continue; 6023 6024 // TODO: We should generate better code and update the cost model for 6025 // predicated uniform stores. Today they are treated as any other 6026 // predicated store (see added test cases in 6027 // invariant-store-vectorization.ll). 6028 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6029 NumPredStores++; 6030 6031 if (Legal->isUniform(Ptr) && 6032 // Conditional loads and stores should be scalarized and predicated. 6033 // isScalarWithPredication cannot be used here since masked 6034 // gather/scatters are not considered scalar with predication. 6035 !Legal->blockNeedsPredication(I.getParent())) { 6036 // TODO: Avoid replicating loads and stores instead of 6037 // relying on instcombine to remove them. 6038 // Load: Scalar load + broadcast 6039 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6040 unsigned Cost = getUniformMemOpCost(&I, VF); 6041 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6042 continue; 6043 } 6044 6045 // We assume that widening is the best solution when possible. 6046 if (memoryInstructionCanBeWidened(&I, VF)) { 6047 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6048 int ConsecutiveStride = 6049 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6050 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6051 "Expected consecutive stride."); 6052 InstWidening Decision = 6053 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6054 setWideningDecision(&I, VF, Decision, Cost); 6055 continue; 6056 } 6057 6058 // Choose between Interleaving, Gather/Scatter or Scalarization. 
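// Editor's sketch of the decision below (the costs are made-up numbers used
// only to illustrate the tie-breaking order): with InterleaveCost = 12,
// GatherScatterCost = 12 and ScalarizationCost = 20 the group is interleaved,
// because interleaving wins ties against gather/scatter as long as it beats
// scalarization; with InterleaveCost = UINT_MAX, GatherScatterCost = 16 and
// ScalarizationCost = 10 the access is scalarized instead.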
6059 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6060 unsigned NumAccesses = 1; 6061 if (isAccessInterleaved(&I)) { 6062 auto Group = getInterleavedAccessGroup(&I); 6063 assert(Group && "Fail to get an interleaved access group."); 6064 6065 // Make one decision for the whole group. 6066 if (getWideningDecision(&I, VF) != CM_Unknown) 6067 continue; 6068 6069 NumAccesses = Group->getNumMembers(); 6070 if (interleavedAccessCanBeWidened(&I, VF)) 6071 InterleaveCost = getInterleaveGroupCost(&I, VF); 6072 } 6073 6074 unsigned GatherScatterCost = 6075 isLegalGatherOrScatter(&I) 6076 ? getGatherScatterCost(&I, VF) * NumAccesses 6077 : std::numeric_limits<unsigned>::max(); 6078 6079 unsigned ScalarizationCost = 6080 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6081 6082 // Choose better solution for the current VF, 6083 // write down this decision and use it during vectorization. 6084 unsigned Cost; 6085 InstWidening Decision; 6086 if (InterleaveCost <= GatherScatterCost && 6087 InterleaveCost < ScalarizationCost) { 6088 Decision = CM_Interleave; 6089 Cost = InterleaveCost; 6090 } else if (GatherScatterCost < ScalarizationCost) { 6091 Decision = CM_GatherScatter; 6092 Cost = GatherScatterCost; 6093 } else { 6094 Decision = CM_Scalarize; 6095 Cost = ScalarizationCost; 6096 } 6097 // If the instructions belongs to an interleave group, the whole group 6098 // receives the same decision. The whole group receives the cost, but 6099 // the cost will actually be assigned to one instruction. 6100 if (auto Group = getInterleavedAccessGroup(&I)) 6101 setWideningDecision(Group, VF, Decision, Cost); 6102 else 6103 setWideningDecision(&I, VF, Decision, Cost); 6104 } 6105 } 6106 6107 // Make sure that any load of address and any other address computation 6108 // remains scalar unless there is gather/scatter support. This avoids 6109 // inevitable extracts into address registers, and also has the benefit of 6110 // activating LSR more, since that pass can't optimize vectorized 6111 // addresses. 6112 if (TTI.prefersVectorizedAddressing()) 6113 return; 6114 6115 // Start with all scalar pointer uses. 6116 SmallPtrSet<Instruction *, 8> AddrDefs; 6117 for (BasicBlock *BB : TheLoop->blocks()) 6118 for (Instruction &I : *BB) { 6119 Instruction *PtrDef = 6120 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6121 if (PtrDef && TheLoop->contains(PtrDef) && 6122 getWideningDecision(&I, VF) != CM_GatherScatter) 6123 AddrDefs.insert(PtrDef); 6124 } 6125 6126 // Add all instructions used to generate the addresses. 6127 SmallVector<Instruction *, 4> Worklist; 6128 for (auto *I : AddrDefs) 6129 Worklist.push_back(I); 6130 while (!Worklist.empty()) { 6131 Instruction *I = Worklist.pop_back_val(); 6132 for (auto &Op : I->operands()) 6133 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6134 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6135 AddrDefs.insert(InstOp).second) 6136 Worklist.push_back(InstOp); 6137 } 6138 6139 for (auto *I : AddrDefs) { 6140 if (isa<LoadInst>(I)) { 6141 // Setting the desired widening decision should ideally be handled in 6142 // by cost functions, but since this involves the task of finding out 6143 // if the loaded register is involved in an address computation, it is 6144 // instead changed here when we know this is the case. 6145 InstWidening Decision = getWideningDecision(I, VF); 6146 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6147 // Scalarize a widened load of address. 
6148 setWideningDecision(I, VF, CM_Scalarize, 6149 (VF * getMemoryInstructionCost(I, 1))); 6150 else if (auto Group = getInterleavedAccessGroup(I)) { 6151 // Scalarize an interleave group of address loads. 6152 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6153 if (Instruction *Member = Group->getMember(I)) 6154 setWideningDecision(Member, VF, CM_Scalarize, 6155 (VF * getMemoryInstructionCost(Member, 1))); 6156 } 6157 } 6158 } else 6159 // Make sure I gets scalarized and a cost estimate without 6160 // scalarization overhead. 6161 ForcedScalars[VF].insert(I); 6162 } 6163 } 6164 6165 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6166 unsigned VF, 6167 Type *&VectorTy) { 6168 Type *RetTy = I->getType(); 6169 if (canTruncateToMinimalBitwidth(I, VF)) 6170 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6171 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6172 auto SE = PSE.getSE(); 6173 6174 // TODO: We need to estimate the cost of intrinsic calls. 6175 switch (I->getOpcode()) { 6176 case Instruction::GetElementPtr: 6177 // We mark this instruction as zero-cost because the cost of GEPs in 6178 // vectorized code depends on whether the corresponding memory instruction 6179 // is scalarized or not. Therefore, we handle GEPs with the memory 6180 // instruction cost. 6181 return 0; 6182 case Instruction::Br: { 6183 // In cases of scalarized and predicated instructions, there will be VF 6184 // predicated blocks in the vectorized loop. Each branch around these 6185 // blocks requires also an extract of its vector compare i1 element. 6186 bool ScalarPredicatedBB = false; 6187 BranchInst *BI = cast<BranchInst>(I); 6188 if (VF > 1 && BI->isConditional() && 6189 (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) != 6190 PredicatedBBsAfterVectorization.end() || 6191 PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) != 6192 PredicatedBBsAfterVectorization.end())) 6193 ScalarPredicatedBB = true; 6194 6195 if (ScalarPredicatedBB) { 6196 // Return cost for branches around scalarized and predicated blocks. 6197 Type *Vec_i1Ty = 6198 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6199 return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) + 6200 (TTI.getCFInstrCost(Instruction::Br) * VF)); 6201 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) 6202 // The back-edge branch will remain, as will all scalar branches. 6203 return TTI.getCFInstrCost(Instruction::Br); 6204 else 6205 // This branch will be eliminated by if-conversion. 6206 return 0; 6207 // Note: We currently assume zero cost for an unconditional branch inside 6208 // a predicated block since it will become a fall-through, although we 6209 // may decide in the future to call TTI for all branches. 6210 } 6211 case Instruction::PHI: { 6212 auto *Phi = cast<PHINode>(I); 6213 6214 // First-order recurrences are replaced by vector shuffles inside the loop. 6215 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6216 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) 6217 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 6218 VectorTy, VF - 1, VectorType::get(RetTy, 1)); 6219 6220 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6221 // converted into select instructions. We require N - 1 selects per phi 6222 // node, where N is the number of incoming values. 
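// Editor's example: a phi merging values from three predicated paths
// (N == 3) is charged two vector selects at the chosen VF, i.e.
// (3 - 1) * getCmpSelInstrCost(Select, <VF x Ty>, <VF x i1>).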
6223 if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) 6224 return (Phi->getNumIncomingValues() - 1) * 6225 TTI.getCmpSelInstrCost( 6226 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6227 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)); 6228 6229 return TTI.getCFInstrCost(Instruction::PHI); 6230 } 6231 case Instruction::UDiv: 6232 case Instruction::SDiv: 6233 case Instruction::URem: 6234 case Instruction::SRem: 6235 // If we have a predicated instruction, it may not be executed for each 6236 // vector lane. Get the scalarization cost and scale this amount by the 6237 // probability of executing the predicated block. If the instruction is not 6238 // predicated, we fall through to the next case. 6239 if (VF > 1 && isScalarWithPredication(I)) { 6240 unsigned Cost = 0; 6241 6242 // These instructions have a non-void type, so account for the phi nodes 6243 // that we will create. This cost is likely to be zero. The phi node 6244 // cost, if any, should be scaled by the block probability because it 6245 // models a copy at the end of each predicated block. 6246 Cost += VF * TTI.getCFInstrCost(Instruction::PHI); 6247 6248 // The cost of the non-predicated instruction. 6249 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy); 6250 6251 // The cost of insertelement and extractelement instructions needed for 6252 // scalarization. 6253 Cost += getScalarizationOverhead(I, VF); 6254 6255 // Scale the cost by the probability of executing the predicated blocks. 6256 // This assumes the predicated block for each vector lane is equally 6257 // likely. 6258 return Cost / getReciprocalPredBlockProb(); 6259 } 6260 LLVM_FALLTHROUGH; 6261 case Instruction::Add: 6262 case Instruction::FAdd: 6263 case Instruction::Sub: 6264 case Instruction::FSub: 6265 case Instruction::Mul: 6266 case Instruction::FMul: 6267 case Instruction::FDiv: 6268 case Instruction::FRem: 6269 case Instruction::Shl: 6270 case Instruction::LShr: 6271 case Instruction::AShr: 6272 case Instruction::And: 6273 case Instruction::Or: 6274 case Instruction::Xor: { 6275 // Since we will replace the stride by 1 the multiplication should go away. 6276 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6277 return 0; 6278 // Certain instructions can be cheaper to vectorize if they have a constant 6279 // second vector operand. One example of this are shifts on x86. 6280 Value *Op2 = I->getOperand(1); 6281 TargetTransformInfo::OperandValueProperties Op2VP; 6282 TargetTransformInfo::OperandValueKind Op2VK = 6283 TTI.getOperandInfo(Op2, Op2VP); 6284 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6285 Op2VK = TargetTransformInfo::OK_UniformValue; 6286 6287 SmallVector<const Value *, 4> Operands(I->operand_values()); 6288 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6289 return N * TTI.getArithmeticInstrCost( 6290 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6291 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6292 } 6293 case Instruction::FNeg: { 6294 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6295 return N * TTI.getArithmeticInstrCost( 6296 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6297 TargetTransformInfo::OK_AnyValue, 6298 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6299 I->getOperand(0), I); 6300 } 6301 case Instruction::Select: { 6302 SelectInst *SI = cast<SelectInst>(I); 6303 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6304 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6305 Type *CondTy = SI->getCondition()->getType(); 6306 if (!ScalarCond) 6307 CondTy = VectorType::get(CondTy, VF); 6308 6309 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I); 6310 } 6311 case Instruction::ICmp: 6312 case Instruction::FCmp: { 6313 Type *ValTy = I->getOperand(0)->getType(); 6314 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6315 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6316 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6317 VectorTy = ToVectorTy(ValTy, VF); 6318 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I); 6319 } 6320 case Instruction::Store: 6321 case Instruction::Load: { 6322 unsigned Width = VF; 6323 if (Width > 1) { 6324 InstWidening Decision = getWideningDecision(I, Width); 6325 assert(Decision != CM_Unknown && 6326 "CM decision should be taken at this point"); 6327 if (Decision == CM_Scalarize) 6328 Width = 1; 6329 } 6330 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6331 return getMemoryInstructionCost(I, VF); 6332 } 6333 case Instruction::ZExt: 6334 case Instruction::SExt: 6335 case Instruction::FPToUI: 6336 case Instruction::FPToSI: 6337 case Instruction::FPExt: 6338 case Instruction::PtrToInt: 6339 case Instruction::IntToPtr: 6340 case Instruction::SIToFP: 6341 case Instruction::UIToFP: 6342 case Instruction::Trunc: 6343 case Instruction::FPTrunc: 6344 case Instruction::BitCast: { 6345 // We optimize the truncation of induction variables having constant 6346 // integer steps. The cost of these truncations is the same as the scalar 6347 // operation. 6348 if (isOptimizableIVTruncate(I, VF)) { 6349 auto *Trunc = cast<TruncInst>(I); 6350 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6351 Trunc->getSrcTy(), Trunc); 6352 } 6353 6354 Type *SrcScalarTy = I->getOperand(0)->getType(); 6355 Type *SrcVecTy = 6356 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6357 if (canTruncateToMinimalBitwidth(I, VF)) { 6358 // This cast is going to be shrunk. This may remove the cast or it might 6359 // turn it into slightly different cast. For example, if MinBW == 16, 6360 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6361 // 6362 // Calculate the modified src and dest types. 6363 Type *MinVecTy = VectorTy; 6364 if (I->getOpcode() == Instruction::Trunc) { 6365 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6366 VectorTy = 6367 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6368 } else if (I->getOpcode() == Instruction::ZExt || 6369 I->getOpcode() == Instruction::SExt) { 6370 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6371 VectorTy = 6372 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6373 } 6374 } 6375 6376 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6377 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); 6378 } 6379 case Instruction::Call: { 6380 bool NeedToScalarize; 6381 CallInst *CI = cast<CallInst>(I); 6382 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6383 if (getVectorIntrinsicIDForCall(CI, TLI)) 6384 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6385 return CallCost; 6386 } 6387 default: 6388 // The cost of executing VF copies of the scalar instruction. This opcode 6389 // is unknown. Assume that it is the same as 'mul'. 6390 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + 6391 getScalarizationOverhead(I, VF); 6392 } // end of switch. 6393 } 6394 6395 char LoopVectorize::ID = 0; 6396 6397 static const char lv_name[] = "Loop Vectorization"; 6398 6399 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6400 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6401 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6402 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6403 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6404 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6405 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6406 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6407 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6408 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6409 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6410 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6411 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6412 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6413 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6414 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6415 6416 namespace llvm { 6417 6418 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6419 6420 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6421 bool VectorizeOnlyWhenForced) { 6422 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6423 } 6424 6425 } // end namespace llvm 6426 6427 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6428 // Check if the pointer operand of a load or store instruction is 6429 // consecutive. 6430 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6431 return Legal->isConsecutivePtr(Ptr); 6432 return false; 6433 } 6434 6435 void LoopVectorizationCostModel::collectValuesToIgnore() { 6436 // Ignore ephemeral values. 6437 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6438 6439 // Ignore type-promoting instructions we identified during reduction 6440 // detection. 6441 for (auto &Reduction : Legal->getReductionVars()) { 6442 RecurrenceDescriptor &RedDes = Reduction.second; 6443 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6444 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6445 } 6446 // Ignore type-casting instructions we identified during induction 6447 // detection. 6448 for (auto &Induction : Legal->getInductionVars()) { 6449 InductionDescriptor &IndDes = Induction.second; 6450 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6451 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6452 } 6453 } 6454 6455 // TODO: we could return a pair of values that specify the max VF and 6456 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6457 // `buildVPlans(VF, VF)`. 
We cannot do it because VPLAN at the moment 6458 // doesn't have a cost model that can choose which plan to execute if 6459 // more than one is generated. 6460 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 6461 LoopVectorizationCostModel &CM) { 6462 unsigned WidestType; 6463 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 6464 return WidestVectorRegBits / WidestType; 6465 } 6466 6467 VectorizationFactor 6468 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { 6469 unsigned VF = UserVF; 6470 // Outer loop handling: They may require CFG and instruction level 6471 // transformations before even evaluating whether vectorization is profitable. 6472 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 6473 // the vectorization pipeline. 6474 if (!OrigLoop->empty()) { 6475 // If the user doesn't provide a vectorization factor, determine a 6476 // reasonable one. 6477 if (!UserVF) { 6478 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM); 6479 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 6480 6481 // Make sure we have a VF > 1 for stress testing. 6482 if (VPlanBuildStressTest && VF < 2) { 6483 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 6484 << "overriding computed VF.\n"); 6485 VF = 4; 6486 } 6487 } 6488 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 6489 assert(isPowerOf2_32(VF) && "VF needs to be a power of two"); 6490 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF 6491 << " to build VPlans.\n"); 6492 buildVPlans(VF, VF); 6493 6494 // For VPlan build stress testing, we bail out after VPlan construction. 6495 if (VPlanBuildStressTest) 6496 return VectorizationFactor::Disabled(); 6497 6498 return {VF, 0}; 6499 } 6500 6501 LLVM_DEBUG( 6502 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 6503 "VPlan-native path.\n"); 6504 return VectorizationFactor::Disabled(); 6505 } 6506 6507 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) { 6508 assert(OrigLoop->empty() && "Inner loop expected."); 6509 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(); 6510 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 6511 return None; 6512 6513 // Invalidate interleave groups if all blocks of loop will be predicated. 6514 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 6515 !useMaskedInterleavedAccesses(*TTI)) { 6516 LLVM_DEBUG( 6517 dbgs() 6518 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 6519 "which requires masked-interleaved support.\n"); 6520 CM.InterleaveInfo.reset(); 6521 } 6522 6523 if (UserVF) { 6524 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 6525 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); 6526 // Collect the instructions (and their associated costs) that will be more 6527 // profitable to scalarize. 6528 CM.selectUserVectorizationFactor(UserVF); 6529 buildVPlansWithVPRecipes(UserVF, UserVF); 6530 LLVM_DEBUG(printPlans(dbgs())); 6531 return {{UserVF, 0}}; 6532 } 6533 6534 unsigned MaxVF = MaybeMaxVF.getValue(); 6535 assert(MaxVF != 0 && "MaxVF is zero."); 6536 6537 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 6538 // Collect Uniform and Scalar instructions after vectorization with VF. 6539 CM.collectUniformsAndScalars(VF); 6540 6541 // Collect the instructions (and their associated costs) that will be more 6542 // profitable to scalarize. 
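// (Editor's note: this loop visits the candidate factors 1, 2, 4, ..., MaxVF,
// e.g. VF = 1, 2, 4, 8 when MaxVF == 8; only VF > 1 needs the scalarization
// analysis below, because collectInstsToScalarize is a no-op for VF < 2.)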
6543 if (VF > 1) 6544 CM.collectInstsToScalarize(VF); 6545 } 6546 6547 buildVPlansWithVPRecipes(1, MaxVF); 6548 LLVM_DEBUG(printPlans(dbgs())); 6549 if (MaxVF == 1) 6550 return VectorizationFactor::Disabled(); 6551 6552 // Select the optimal vectorization factor. 6553 return CM.selectVectorizationFactor(MaxVF); 6554 } 6555 6556 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { 6557 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 6558 << '\n'); 6559 BestVF = VF; 6560 BestUF = UF; 6561 6562 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 6563 return !Plan->hasVF(VF); 6564 }); 6565 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 6566 } 6567 6568 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 6569 DominatorTree *DT) { 6570 // Perform the actual loop transformation. 6571 6572 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 6573 VPCallbackILV CallbackILV(ILV); 6574 6575 VPTransformState State{BestVF, BestUF, LI, 6576 DT, ILV.Builder, ILV.VectorLoopValueMap, 6577 &ILV, CallbackILV}; 6578 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 6579 State.TripCount = ILV.getOrCreateTripCount(nullptr); 6580 State.CanonicalIV = ILV.Induction; 6581 6582 //===------------------------------------------------===// 6583 // 6584 // Notice: any optimization or new instruction that go 6585 // into the code below should also be implemented in 6586 // the cost-model. 6587 // 6588 //===------------------------------------------------===// 6589 6590 // 2. Copy and widen instructions from the old loop into the new loop. 6591 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 6592 VPlans.front()->execute(&State); 6593 6594 // 3. Fix the vectorized code: take care of header phi's, live-outs, 6595 // predication, updating analyses. 6596 ILV.fixVectorizedLoop(); 6597 } 6598 6599 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 6600 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6601 BasicBlock *Latch = OrigLoop->getLoopLatch(); 6602 6603 // We create new control-flow for the vectorized loop, so the original 6604 // condition will be dead after vectorization if it's only used by the 6605 // branch. 6606 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 6607 if (Cmp && Cmp->hasOneUse()) 6608 DeadInstructions.insert(Cmp); 6609 6610 // We create new "steps" for induction variable updates to which the original 6611 // induction variables map. An original update instruction will be dead if 6612 // all its users except the induction variable are dead. 6613 for (auto &Induction : Legal->getInductionVars()) { 6614 PHINode *Ind = Induction.first; 6615 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 6616 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 6617 return U == Ind || DeadInstructions.find(cast<Instruction>(U)) != 6618 DeadInstructions.end(); 6619 })) 6620 DeadInstructions.insert(IndUpdate); 6621 6622 // We record as "Dead" also the type-casting instructions we had identified 6623 // during induction analysis. We don't need any handling for them in the 6624 // vectorized loop because we have proven that, under a proper runtime 6625 // test guarding the vectorized loop, the value of the phi, and the casted 6626 // value of the phi, are the same. The last instruction in this casting chain 6627 // will get its scalar/vector/widened def from the scalar/vector/widened def 6628 // of the respective phi node. 
Any other casts in the induction def-use chain 6629 // have no other uses outside the phi update chain, and will be ignored. 6630 InductionDescriptor &IndDes = Induction.second; 6631 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6632 DeadInstructions.insert(Casts.begin(), Casts.end()); 6633 } 6634 } 6635 6636 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6637 6638 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6639 6640 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6641 Instruction::BinaryOps BinOp) { 6642 // When unrolling and the VF is 1, we only need to add a simple scalar. 6643 Type *Ty = Val->getType(); 6644 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6645 6646 if (Ty->isFloatingPointTy()) { 6647 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6648 6649 // Floating point operations had to be 'fast' to enable the unrolling. 6650 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6651 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6652 } 6653 Constant *C = ConstantInt::get(Ty, StartIdx); 6654 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6655 } 6656 6657 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6658 SmallVector<Metadata *, 4> MDs; 6659 // Reserve first location for self reference to the LoopID metadata node. 6660 MDs.push_back(nullptr); 6661 bool IsUnrollMetadata = false; 6662 MDNode *LoopID = L->getLoopID(); 6663 if (LoopID) { 6664 // First find existing loop unrolling disable metadata. 6665 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6666 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6667 if (MD) { 6668 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6669 IsUnrollMetadata = 6670 S && S->getString().startswith("llvm.loop.unroll.disable"); 6671 } 6672 MDs.push_back(LoopID->getOperand(i)); 6673 } 6674 } 6675 6676 if (!IsUnrollMetadata) { 6677 // Add runtime unroll disable metadata. 6678 LLVMContext &Context = L->getHeader()->getContext(); 6679 SmallVector<Metadata *, 1> DisableOperands; 6680 DisableOperands.push_back( 6681 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6682 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6683 MDs.push_back(DisableNode); 6684 MDNode *NewLoopID = MDNode::get(Context, MDs); 6685 // Set operand 0 to refer to the loop id itself. 6686 NewLoopID->replaceOperandWith(0, NewLoopID); 6687 L->setLoopID(NewLoopID); 6688 } 6689 } 6690 6691 bool LoopVectorizationPlanner::getDecisionAndClampRange( 6692 const std::function<bool(unsigned)> &Predicate, VFRange &Range) { 6693 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 6694 bool PredicateAtRangeStart = Predicate(Range.Start); 6695 6696 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 6697 if (Predicate(TmpVF) != PredicateAtRangeStart) { 6698 Range.End = TmpVF; 6699 break; 6700 } 6701 6702 return PredicateAtRangeStart; 6703 } 6704 6705 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 6706 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 6707 /// of VF's starting at a given VF and extending it as much as possible. Each 6708 /// vectorization decision can potentially shorten this sub-range during 6709 /// buildVPlan(). 
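/// For example (editor's illustration), with MinVF = 1 and MaxVF = 8 and no
/// decision splitting the range, a single VPlan covering {1, 2, 4, 8} is
/// built; if some decision changes starting at VF = 4, the first call clamps
/// its sub-range to {1, 2} and a second VPlan is built for {4, 8}.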
6710 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 6711 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6712 VFRange SubRange = {VF, MaxVF + 1}; 6713 VPlans.push_back(buildVPlan(SubRange)); 6714 VF = SubRange.End; 6715 } 6716 } 6717 6718 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 6719 VPlanPtr &Plan) { 6720 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 6721 6722 // Look for cached value. 6723 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 6724 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 6725 if (ECEntryIt != EdgeMaskCache.end()) 6726 return ECEntryIt->second; 6727 6728 VPValue *SrcMask = createBlockInMask(Src, Plan); 6729 6730 // The terminator has to be a branch inst! 6731 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 6732 assert(BI && "Unexpected terminator found"); 6733 6734 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 6735 return EdgeMaskCache[Edge] = SrcMask; 6736 6737 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 6738 assert(EdgeMask && "No Edge Mask found for condition"); 6739 6740 if (BI->getSuccessor(0) != Dst) 6741 EdgeMask = Builder.createNot(EdgeMask); 6742 6743 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 6744 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 6745 6746 return EdgeMaskCache[Edge] = EdgeMask; 6747 } 6748 6749 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 6750 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 6751 6752 // Look for cached value. 6753 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 6754 if (BCEntryIt != BlockMaskCache.end()) 6755 return BCEntryIt->second; 6756 6757 // All-one mask is modelled as no-mask following the convention for masked 6758 // load/store/gather/scatter. Initialize BlockMask to no-mask. 6759 VPValue *BlockMask = nullptr; 6760 6761 if (OrigLoop->getHeader() == BB) { 6762 if (!CM.blockNeedsPredication(BB)) 6763 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 6764 6765 // Introduce the early-exit compare IV <= BTC to form header block mask. 6766 // This is used instead of IV < TC because TC may wrap, unlike BTC. 6767 // Start by constructing the desired canonical IV. 6768 VPValue *IV = nullptr; 6769 if (Legal->getPrimaryInduction()) 6770 IV = Plan->getVPValue(Legal->getPrimaryInduction()); 6771 else { 6772 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 6773 Builder.getInsertBlock()->appendRecipe(IVRecipe); 6774 IV = IVRecipe->getVPValue(); 6775 } 6776 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 6777 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 6778 return BlockMaskCache[BB] = BlockMask; 6779 } 6780 6781 // This is the block mask. We OR all incoming edges. 6782 for (auto *Predecessor : predecessors(BB)) { 6783 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 6784 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 6785 return BlockMaskCache[BB] = EdgeMask; 6786 6787 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
6788 BlockMask = EdgeMask; 6789 continue; 6790 } 6791 6792 BlockMask = Builder.createOr(BlockMask, EdgeMask); 6793 } 6794 6795 return BlockMaskCache[BB] = BlockMask; 6796 } 6797 6798 VPWidenMemoryInstructionRecipe * 6799 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 6800 VPlanPtr &Plan) { 6801 if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) 6802 return nullptr; 6803 6804 auto willWiden = [&](unsigned VF) -> bool { 6805 if (VF == 1) 6806 return false; 6807 LoopVectorizationCostModel::InstWidening Decision = 6808 CM.getWideningDecision(I, VF); 6809 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 6810 "CM decision should be taken at this point."); 6811 if (Decision == LoopVectorizationCostModel::CM_Interleave) 6812 return true; 6813 if (CM.isScalarAfterVectorization(I, VF) || 6814 CM.isProfitableToScalarize(I, VF)) 6815 return false; 6816 return Decision != LoopVectorizationCostModel::CM_Scalarize; 6817 }; 6818 6819 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6820 return nullptr; 6821 6822 VPValue *Mask = nullptr; 6823 if (Legal->isMaskRequired(I)) 6824 Mask = createBlockInMask(I->getParent(), Plan); 6825 6826 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 6827 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 6828 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 6829 6830 StoreInst *Store = cast<StoreInst>(I); 6831 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 6832 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 6833 } 6834 6835 VPWidenIntOrFpInductionRecipe * 6836 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) { 6837 if (PHINode *Phi = dyn_cast<PHINode>(I)) { 6838 // Check if this is an integer or fp induction. If so, build the recipe that 6839 // produces its scalar and vector values. 6840 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 6841 if (II.getKind() == InductionDescriptor::IK_IntInduction || 6842 II.getKind() == InductionDescriptor::IK_FpInduction) 6843 return new VPWidenIntOrFpInductionRecipe(Phi); 6844 6845 return nullptr; 6846 } 6847 6848 // Optimize the special case where the source is a constant integer 6849 // induction variable. Notice that we can only optimize the 'trunc' case 6850 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 6851 // (c) other casts depend on pointer size. 6852 6853 // Determine whether \p K is a truncation based on an induction variable that 6854 // can be optimized. 6855 auto isOptimizableIVTruncate = 6856 [&](Instruction *K) -> std::function<bool(unsigned)> { 6857 return 6858 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; 6859 }; 6860 6861 if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange( 6862 isOptimizableIVTruncate(I), Range)) 6863 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 6864 cast<TruncInst>(I)); 6865 return nullptr; 6866 } 6867 6868 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) { 6869 PHINode *Phi = dyn_cast<PHINode>(I); 6870 if (!Phi || Phi->getParent() == OrigLoop->getHeader()) 6871 return nullptr; 6872 6873 // We know that all PHIs in non-header blocks are converted into selects, so 6874 // we don't have to worry about the insertion order and we can just use the 6875 // builder. At this point we generate the predication tree. 
There may be 6876 // duplications since this is a simple recursive scan, but future 6877 // optimizations will clean it up. 6878 6879 SmallVector<VPValue *, 2> Operands; 6880 unsigned NumIncoming = Phi->getNumIncomingValues(); 6881 for (unsigned In = 0; In < NumIncoming; In++) { 6882 VPValue *EdgeMask = 6883 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 6884 assert((EdgeMask || NumIncoming == 1) && 6885 "Multiple predecessors with one having a full mask"); 6886 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 6887 if (EdgeMask) 6888 Operands.push_back(EdgeMask); 6889 } 6890 return new VPBlendRecipe(Phi, Operands); 6891 } 6892 6893 VPWidenCallRecipe * 6894 VPRecipeBuilder::tryToWidenCall(Instruction *I, VFRange &Range, VPlan &Plan) { 6895 6896 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6897 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6898 6899 CallInst *CI = dyn_cast<CallInst>(I); 6900 if (IsPredicated || !CI) 6901 return nullptr; 6902 6903 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6904 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 6905 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 6906 return nullptr; 6907 6908 auto willWiden = [&](unsigned VF) -> bool { 6909 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6910 // The following case may be scalarized depending on the VF. 6911 // The flag shows whether we use Intrinsic or a usual Call for vectorized 6912 // version of the instruction. 6913 // Is it beneficial to perform intrinsic call compared to lib call? 6914 bool NeedToScalarize = false; 6915 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 6916 bool UseVectorIntrinsic = 6917 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 6918 return UseVectorIntrinsic || !NeedToScalarize; 6919 }; 6920 6921 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6922 return nullptr; 6923 6924 // Success: widen this call. 6925 auto VPValues = map_range(CI->arg_operands(), [&Plan](Value *Op) { 6926 return Plan.getOrAddVPValue(Op); 6927 }); 6928 6929 return new VPWidenCallRecipe(*CI, VPValues); 6930 } 6931 6932 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 6933 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 6934 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 6935 // Instruction should be widened, unless it is scalar after vectorization, 6936 // scalarization is profitable or it is predicated. 6937 auto WillScalarize = [this, I](unsigned VF) -> bool { 6938 return CM.isScalarAfterVectorization(I, VF) || 6939 CM.isProfitableToScalarize(I, VF) || 6940 CM.isScalarWithPredication(I, VF); 6941 }; 6942 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 6943 Range); 6944 } 6945 6946 VPWidenSelectRecipe *VPRecipeBuilder::tryToWidenSelect(Instruction *I) { 6947 auto *SI = dyn_cast<SelectInst>(I); 6948 if (!SI) 6949 return nullptr; 6950 auto *SE = PSE.getSE(); 6951 bool InvariantCond = 6952 SE->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 6953 // Success: widen this instruction. 
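  // Illustrative note (the behavior lives in the widening code, not here): for
  // a select such as
  //   %r = select i1 %inv.cond, i32 %a, i32 %b
  // where %inv.cond is loop-invariant according to SCEV, InvariantCond lets
  // codegen keep the scalar condition and widen only the two value operands;
  // otherwise the condition is widened to a vector of i1 as well.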
6954 return new VPWidenSelectRecipe(*SI, InvariantCond); 6955 } 6956 6957 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) { 6958 auto IsVectorizableOpcode = [](unsigned Opcode) { 6959 switch (Opcode) { 6960 case Instruction::Add: 6961 case Instruction::And: 6962 case Instruction::AShr: 6963 case Instruction::BitCast: 6964 case Instruction::FAdd: 6965 case Instruction::FCmp: 6966 case Instruction::FDiv: 6967 case Instruction::FMul: 6968 case Instruction::FNeg: 6969 case Instruction::FPExt: 6970 case Instruction::FPToSI: 6971 case Instruction::FPToUI: 6972 case Instruction::FPTrunc: 6973 case Instruction::FRem: 6974 case Instruction::FSub: 6975 case Instruction::ICmp: 6976 case Instruction::IntToPtr: 6977 case Instruction::LShr: 6978 case Instruction::Mul: 6979 case Instruction::Or: 6980 case Instruction::PtrToInt: 6981 case Instruction::SDiv: 6982 case Instruction::Select: 6983 case Instruction::SExt: 6984 case Instruction::Shl: 6985 case Instruction::SIToFP: 6986 case Instruction::SRem: 6987 case Instruction::Sub: 6988 case Instruction::Trunc: 6989 case Instruction::UDiv: 6990 case Instruction::UIToFP: 6991 case Instruction::URem: 6992 case Instruction::Xor: 6993 case Instruction::ZExt: 6994 return true; 6995 } 6996 return false; 6997 }; 6998 6999 if (!IsVectorizableOpcode(I->getOpcode())) 7000 return nullptr; 7001 7002 // Success: widen this instruction. 7003 return new VPWidenRecipe(*I); 7004 } 7005 7006 VPBasicBlock *VPRecipeBuilder::handleReplication( 7007 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 7008 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 7009 VPlanPtr &Plan) { 7010 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 7011 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, 7012 Range); 7013 7014 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7015 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 7016 7017 auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); 7018 setRecipe(I, Recipe); 7019 7020 // Find if I uses a predicated instruction. If so, it will use its scalar 7021 // value. Avoid hoisting the insert-element which packs the scalar value into 7022 // a vector value, as that happens iff all users use the vector value. 7023 for (auto &Op : I->operands()) 7024 if (auto *PredInst = dyn_cast<Instruction>(Op)) 7025 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 7026 PredInst2Recipe[PredInst]->setAlsoPack(false); 7027 7028 // Finalize the recipe for Instr, first if it is not predicated. 7029 if (!IsPredicated) { 7030 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 7031 VPBB->appendRecipe(Recipe); 7032 return VPBB; 7033 } 7034 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7035 assert(VPBB->getSuccessors().empty() && 7036 "VPBB has successors when handling predicated replication."); 7037 // Record predicated instructions for above packing optimizations. 7038 PredInst2Recipe[I] = Recipe; 7039 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7040 VPBlockUtils::insertBlockAfter(Region, VPBB); 7041 auto *RegSucc = new VPBasicBlock(); 7042 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7043 return RegSucc; 7044 } 7045 7046 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7047 VPRecipeBase *PredRecipe, 7048 VPlanPtr &Plan) { 7049 // Instructions marked for predication are replicated and placed under an 7050 // if-then construct to prevent side-effects. 
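  // For illustration, a predicated udiv whose scalar results are packed into a
  // vector ends up in a region shaped roughly like this (block names follow
  // the code below):
  //
  //   pred.udiv.entry     - BranchOnMask(BlockInMask)
  //     |        \
  //     |      pred.udiv.if       - the replicated (predicated) udiv
  //     |        /
  //   pred.udiv.continue  - PHI merging in the predicated value
  //
  // The region is created as a replicating region, so it is executed once per
  // vector lane during VPlan execution.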

  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  auto *PHIRecipe =
      Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
  VPBlockUtils::connectBlocks(Pred, Exit);

  return Region;
}

bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
                                        VPlanPtr &Plan, VPBasicBlock *VPBB) {
  VPRecipeBase *Recipe = nullptr;

  // First, check for specific widening recipes that deal with calls, memory
  // operations, inductions and Phi nodes.
  if ((Recipe = tryToWidenCall(Instr, Range, *Plan)) ||
      (Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
      (Recipe = tryToOptimizeInduction(Instr, Range)) ||
      (Recipe = tryToBlend(Instr, Plan)) ||
      (isa<PHINode>(Instr) &&
       (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
    setRecipe(Instr, Recipe);
    VPBB->appendRecipe(Recipe);
    return true;
  }

  // Calls and memory instructions are widened by the specialized recipes above,
  // or scalarized.
  if (isa<CallInst>(Instr) || isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return false;

  if (!shouldWiden(Instr, Range))
    return false;

  if ((Recipe = tryToWidenSelect(Instr)) ||
      (isa<GetElementPtrInst>(Instr) &&
       (Recipe =
            new VPWidenGEPRecipe(cast<GetElementPtrInst>(Instr), OrigLoop))) ||
      (Recipe = tryToWiden(Instr, *Plan))) {
    setRecipe(Instr, Recipe);
    VPBB->appendRecipe(Recipe);
    return true;
  }

  return false;
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
                                                        unsigned MaxVF) {
  assert(OrigLoop->empty() && "Inner loop expected.");

  // Collect conditions feeding internal conditional branches; they need to be
  // represented in VPlan for it to model masking.
  SmallPtrSet<Value *, 1> NeedDef;

  auto *Latch = OrigLoop->getLoopLatch();
  for (BasicBlock *BB : OrigLoop->blocks()) {
    if (BB == Latch)
      continue;
    BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
    if (Branch && Branch->isConditional())
      NeedDef.insert(Branch->getCondition());
  }

  // If the tail is to be folded by masking, the primary induction variable, if
  // it exists, needs to be represented in VPlan for it to model early-exit
  // masking. Also, both the Phi and the live-out instruction of each reduction
  // are required in order to introduce a select between them in VPlan.
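  // For example, when folding the tail of a loop with a summation reduction
  //   %red.phi  = phi i32 [ 0, %ph ], [ %red.next, %loop ]
  //   %red.next = add i32 %red.phi, %val
  // a select keyed on the header block mask is added at the end of the latch,
  //   %sel = select <mask>, %red.next, %red.phi
  // so masked-off (tail) lanes do not contribute to the reduction; hence both
  // the phi and the live-out add need VPValues.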
  if (CM.foldTailByMasking()) {
    if (Legal->getPrimaryInduction())
      NeedDef.insert(Legal->getPrimaryInduction());
    for (auto &Reduction : Legal->getReductionVars()) {
      NeedDef.insert(Reduction.first);
      NeedDef.insert(Reduction.second.getLoopExitInstr());
    }
  }

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
                                             DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
    SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const DenseMap<Instruction *, Instruction *> &SinkAfter) {

  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of vector value.
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
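  // For example, given the interleaved pair
  //   %x = load i32, i32* %gep.even   ; A[2*i]
  //   %y = load i32, i32* %gep.odd    ; A[2*i+1]
  // both member recipes are recorded here; after the initial VPlan is built
  // they are replaced by a single VPInterleaveRecipe at the group's insert
  // position, which during execution emits one wide load plus shuffles that
  // de-interleave the members.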
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto applyIG = [IG, this](unsigned VF) -> bool {
      return (VF >= 2 && // Query is illegal for VF == 1
              CM.getWideningDecision(IG->getInsertPos(), VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
    if (!getDecisionAndClampRange(applyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
    for (unsigned i = 0; i < IG->getFactor(); i++)
      if (Instruction *Member = IG->getMember(i))
        RecipeBuilder.recordRecipeOf(Member);
  }

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
  auto Plan = std::make_unique<VPlan>();
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  Plan->setEntry(VPBB);

  // Represent values that will have defs inside VPlan.
  for (Value *V : NeedDef)
    Plan->addVPValue(V);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
      if (isa<BranchInst>(Instr) ||
          DeadInstructions.find(Instr) != DeadInstructions.end())
        continue;

      if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
        continue;

      // Otherwise, if all widening options failed, Instruction is to be
      // replicated. This may create a successor for VPBB.
      VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
          Instr, Range, VPBB, PredInst2Recipe, Plan);
      if (NextVPBB != VPBB) {
        VPBB = NextVPBB;
        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
                                    : "");
      }
    }
  }

  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one (VPBB), reflecting original
  // basic blocks with no recipes.
  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
  VPBlockUtils::disconnectBlocks(PreEntry, Entry);
  delete PreEntry;

  // ---------------------------------------------------------------------------
  // Transform initial VPlan: Apply previously taken decisions, in order, to
  // bring the VPlan to its final state.
7281 // --------------------------------------------------------------------------- 7282 7283 // Apply Sink-After legal constraints. 7284 for (auto &Entry : SinkAfter) { 7285 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7286 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7287 Sink->moveAfter(Target); 7288 } 7289 7290 // Interleave memory: for each Interleave Group we marked earlier as relevant 7291 // for this VPlan, replace the Recipes widening its memory instructions with a 7292 // single VPInterleaveRecipe at its insertion point. 7293 for (auto IG : InterleaveGroups) { 7294 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7295 RecipeBuilder.getRecipe(IG->getInsertPos())); 7296 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) 7297 ->insertBefore(Recipe); 7298 7299 for (unsigned i = 0; i < IG->getFactor(); ++i) 7300 if (Instruction *Member = IG->getMember(i)) { 7301 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7302 } 7303 } 7304 7305 // Finally, if tail is folded by masking, introduce selects between the phi 7306 // and the live-out instruction of each reduction, at the end of the latch. 7307 if (CM.foldTailByMasking()) { 7308 Builder.setInsertPoint(VPBB); 7309 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 7310 for (auto &Reduction : Legal->getReductionVars()) { 7311 VPValue *Phi = Plan->getVPValue(Reduction.first); 7312 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); 7313 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 7314 } 7315 } 7316 7317 std::string PlanName; 7318 raw_string_ostream RSO(PlanName); 7319 unsigned VF = Range.Start; 7320 Plan->addVF(VF); 7321 RSO << "Initial VPlan for VF={" << VF; 7322 for (VF *= 2; VF < Range.End; VF *= 2) { 7323 Plan->addVF(VF); 7324 RSO << "," << VF; 7325 } 7326 RSO << "},UF>=1"; 7327 RSO.flush(); 7328 Plan->setName(PlanName); 7329 7330 return Plan; 7331 } 7332 7333 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 7334 // Outer loop handling: They may require CFG and instruction level 7335 // transformations before even evaluating whether vectorization is profitable. 7336 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7337 // the vectorization pipeline. 7338 assert(!OrigLoop->empty()); 7339 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7340 7341 // Create new empty VPlan 7342 auto Plan = std::make_unique<VPlan>(); 7343 7344 // Build hierarchical CFG 7345 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 7346 HCFGBuilder.buildHierarchicalCFG(); 7347 7348 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) 7349 Plan->addVF(VF); 7350 7351 if (EnableVPlanPredication) { 7352 VPlanPredicator VPP(*Plan); 7353 VPP.predicate(); 7354 7355 // Avoid running transformation to recipes until masked code generation in 7356 // VPlan-native path is in place. 
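    // Roughly, VPlanPredicator computes a block-in predicate for each
    // VPBasicBlock (the OR over its incoming edges of the predecessor's
    // predicate AND'ed with the edge condition) and then linearizes the
    // region's CFG, so that a later masked code generation stage can guard
    // each block by its predicate.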
7357 return Plan; 7358 } 7359 7360 SmallPtrSet<Instruction *, 1> DeadInstructions; 7361 VPlanTransforms::VPInstructionsToVPRecipes( 7362 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 7363 return Plan; 7364 } 7365 7366 Value* LoopVectorizationPlanner::VPCallbackILV:: 7367 getOrCreateVectorValues(Value *V, unsigned Part) { 7368 return ILV.getOrCreateVectorValue(V, Part); 7369 } 7370 7371 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 7372 Value *V, const VPIteration &Instance) { 7373 return ILV.getOrCreateScalarValue(V, Instance); 7374 } 7375 7376 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 7377 VPSlotTracker &SlotTracker) const { 7378 O << " +\n" 7379 << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7380 IG->getInsertPos()->printAsOperand(O, false); 7381 O << ", "; 7382 getAddr()->printAsOperand(O, SlotTracker); 7383 VPValue *Mask = getMask(); 7384 if (Mask) { 7385 O << ", "; 7386 Mask->printAsOperand(O, SlotTracker); 7387 } 7388 O << "\\l\""; 7389 for (unsigned i = 0; i < IG->getFactor(); ++i) 7390 if (Instruction *I = IG->getMember(i)) 7391 O << " +\n" 7392 << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\""; 7393 } 7394 7395 void VPWidenCallRecipe::execute(VPTransformState &State) { 7396 State.ILV->widenCallInstruction(Ingredient, User, State); 7397 } 7398 7399 void VPWidenSelectRecipe::execute(VPTransformState &State) { 7400 State.ILV->widenSelectInstruction(Ingredient, InvariantCond); 7401 } 7402 7403 void VPWidenRecipe::execute(VPTransformState &State) { 7404 State.ILV->widenInstruction(Ingredient); 7405 } 7406 7407 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7408 State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, 7409 IsIndexLoopInvariant); 7410 } 7411 7412 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7413 assert(!State.Instance && "Int or FP induction being replicated."); 7414 State.ILV->widenIntOrFpInduction(IV, Trunc); 7415 } 7416 7417 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7418 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7419 } 7420 7421 void VPBlendRecipe::execute(VPTransformState &State) { 7422 State.ILV->setDebugLocFromInst(State.Builder, Phi); 7423 // We know that all PHIs in non-header blocks are converted into 7424 // selects, so we don't have to worry about the insertion order and we 7425 // can just use the builder. 7426 // At this point we generate the predication tree. There may be 7427 // duplications since this is a simple recursive scan, but future 7428 // optimizations will clean it up. 7429 7430 unsigned NumIncoming = getNumIncomingValues(); 7431 7432 // Generate a sequence of selects of the form: 7433 // SELECT(Mask3, In3, 7434 // SELECT(Mask2, In2, 7435 // SELECT(Mask1, In1, 7436 // In0))) 7437 // Note that Mask0 is never used: lanes for which no path reaches this phi and 7438 // are essentially undef are taken from In0. 7439 InnerLoopVectorizer::VectorParts Entry(State.UF); 7440 for (unsigned In = 0; In < NumIncoming; ++In) { 7441 for (unsigned Part = 0; Part < State.UF; ++Part) { 7442 // We might have single edge PHIs (blocks) - use an identity 7443 // 'select' for the first PHI operand. 7444 Value *In0 = State.get(getIncomingValue(In), Part); 7445 if (In == 0) 7446 Entry[Part] = In0; // Initialize with the first incoming value. 7447 else { 7448 // Select between the current value and the previous incoming edge 7449 // based on the incoming mask. 
        Value *Cond = State.get(getMask(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
}

void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF > 1) {
      // If we're constructing lane 0, initialize to start from undef.
      if (State.Instance->Lane == 0) {
        Value *Undef =
            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF;
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  if (!User) // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();
  else {
    VPValue *BlockInMask = User->getOperand(0);
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  }

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst = cast<Instruction>(
      State.ValueMap.getScalarValue(PredInst, *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.ValueMap.hasVectorValue(PredInst, Part)) {
    Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
  } else {
    Type *PredInstType = PredInst->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
  }
}

void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
  VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
  State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
                                        getMask());
}

// Determine how to lower the scalar epilogue, which depends on 1) optimising
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyses whether the loop is suitable
// for predication.
static ScalarEpilogueLowering getScalarEpilogueLowering(
    Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
    BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
    AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
    LoopVectorizationLegality &LVL) {
  bool OptSize =
      F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
                                                     PGSOQueryType::IRPass);
  // 1) OptSize takes precedence over all other options, i.e. if this is set,
  // don't look at hints or options, and don't request a scalar epilogue.
  if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
    return CM_ScalarEpilogueNotAllowedOptSize;

  bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
                              !PreferPredicateOverEpilog;

  // 2) Next, if disabling predication is requested on the command line, honour
  // this and request a scalar epilogue.
  if (PredicateOptDisabled)
    return CM_ScalarEpilogueAllowed;

  // 3) and 4) If enabling predication is requested on the command line or with
  // a loop hint, or if the TTI hook indicates this is profitable, request
  // predication.
  if (PreferPredicateOverEpilog ||
      Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
      (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
                                        LVL.getLAI()) &&
       Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows VPlan-to-VPlan
// transformations to be applied from the very beginning without modifying the
// input LLVM IR.
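// For example, an explicitly annotated outer loop such as
//
//   #pragma clang loop vectorize(enable)
//   for (int i = 0; i < N; ++i)     // outer loop, reaches this path
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];
//
// is routed here from processLoop (the !L->empty() case), assuming the
// VPlan-native path has been enabled with -enable-vplan-native-path.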
7596 static bool processLoopInVPlanNativePath( 7597 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 7598 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 7599 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 7600 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 7601 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 7602 7603 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 7604 Function *F = L->getHeader()->getParent(); 7605 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 7606 7607 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7608 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 7609 7610 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 7611 &Hints, IAI); 7612 // Use the planner for outer loop vectorization. 7613 // TODO: CM is not used at this point inside the planner. Turn CM into an 7614 // optional argument if we don't need it in the future. 7615 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 7616 7617 // Get user vectorization factor. 7618 const unsigned UserVF = Hints.getWidth(); 7619 7620 // Plan how to best vectorize, return the best VF and its cost. 7621 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 7622 7623 // If we are stress testing VPlan builds, do not attempt to generate vector 7624 // code. Masked vector code generation support will follow soon. 7625 // Also, do not attempt to vectorize if no vector code will be produced. 7626 if (VPlanBuildStressTest || EnableVPlanPredication || 7627 VectorizationFactor::Disabled() == VF) 7628 return false; 7629 7630 LVP.setBestPlan(VF.Width, 1); 7631 7632 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 7633 &CM); 7634 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 7635 << L->getHeader()->getParent()->getName() << "\"\n"); 7636 LVP.executePlan(LB, DT); 7637 7638 // Mark the loop as already vectorized to avoid vectorizing again. 7639 Hints.setAlreadyVectorized(); 7640 7641 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); 7642 return true; 7643 } 7644 7645 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 7646 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 7647 !EnableLoopInterleaving), 7648 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 7649 !EnableLoopVectorization) {} 7650 7651 bool LoopVectorizePass::processLoop(Loop *L) { 7652 assert((EnableVPlanNativePath || L->empty()) && 7653 "VPlan-native path is not enabled. Only process inner loops."); 7654 7655 #ifndef NDEBUG 7656 const std::string DebugLocStr = getDebugLocString(L); 7657 #endif /* NDEBUG */ 7658 7659 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 7660 << L->getHeader()->getParent()->getName() << "\" from " 7661 << DebugLocStr << "\n"); 7662 7663 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 7664 7665 LLVM_DEBUG( 7666 dbgs() << "LV: Loop hints:" 7667 << " force=" 7668 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 7669 ? "disabled" 7670 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 7671 ? 
"enabled" 7672 : "?")) 7673 << " width=" << Hints.getWidth() 7674 << " unroll=" << Hints.getInterleave() << "\n"); 7675 7676 // Function containing loop 7677 Function *F = L->getHeader()->getParent(); 7678 7679 // Looking at the diagnostic output is the only way to determine if a loop 7680 // was vectorized (other than looking at the IR or machine code), so it 7681 // is important to generate an optimization remark for each loop. Most of 7682 // these messages are generated as OptimizationRemarkAnalysis. Remarks 7683 // generated as OptimizationRemark and OptimizationRemarkMissed are 7684 // less verbose reporting vectorized loops and unvectorized loops that may 7685 // benefit from vectorization, respectively. 7686 7687 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 7688 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 7689 return false; 7690 } 7691 7692 PredicatedScalarEvolution PSE(*SE, *L); 7693 7694 // Check if it is legal to vectorize the loop. 7695 LoopVectorizationRequirements Requirements(*ORE); 7696 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 7697 &Requirements, &Hints, DB, AC); 7698 if (!LVL.canVectorize(EnableVPlanNativePath)) { 7699 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 7700 Hints.emitRemarkWithHints(); 7701 return false; 7702 } 7703 7704 // Check the function attributes and profiles to find out if this function 7705 // should be optimized for size. 7706 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7707 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 7708 7709 // Entrance to the VPlan-native vectorization path. Outer loops are processed 7710 // here. They may require CFG and instruction level transformations before 7711 // even evaluating whether vectorization is profitable. Since we cannot modify 7712 // the incoming IR, we need to build VPlan upfront in the vectorization 7713 // pipeline. 7714 if (!L->empty()) 7715 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 7716 ORE, BFI, PSI, Hints); 7717 7718 assert(L->empty() && "Inner loop expected."); 7719 7720 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 7721 // count by optimizing for size, to minimize overheads. 7722 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 7723 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 7724 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 7725 << "This loop is worth vectorizing only if no scalar " 7726 << "iteration overheads are incurred."); 7727 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 7728 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 7729 else { 7730 LLVM_DEBUG(dbgs() << "\n"); 7731 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 7732 } 7733 } 7734 7735 // Check the function attributes to see if implicit floats are allowed. 7736 // FIXME: This check doesn't seem possibly correct -- what if the loop is 7737 // an integer loop and the vector instructions selected are purely integer 7738 // vector instructions? 7739 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 7740 reportVectorizationFailure( 7741 "Can't vectorize when the NoImplicitFloat attribute is used", 7742 "loop not vectorized due to NoImplicitFloat attribute", 7743 "NoImplicitFloat", ORE, L); 7744 Hints.emitRemarkWithHints(); 7745 return false; 7746 } 7747 7748 // Check if the target supports potentially unsafe FP vectorization. 
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get user vectorization factor.
  unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;
  unsigned UserIC = Hints.getInterleave();

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width == 1) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not legal to vectorize the loop, then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                               &CM);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is *legal* to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling a scalar loop when there are
    // no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}

bool LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return false;

  bool Changed = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= processLoop(L);
  }

  // Process each loop nest in the function.
8010 return Changed; 8011 } 8012 8013 PreservedAnalyses LoopVectorizePass::run(Function &F, 8014 FunctionAnalysisManager &AM) { 8015 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 8016 auto &LI = AM.getResult<LoopAnalysis>(F); 8017 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 8018 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 8019 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 8020 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 8021 auto &AA = AM.getResult<AAManager>(F); 8022 auto &AC = AM.getResult<AssumptionAnalysis>(F); 8023 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 8024 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 8025 MemorySSA *MSSA = EnableMSSALoopDependency 8026 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() 8027 : nullptr; 8028 8029 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 8030 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 8031 [&](Loop &L) -> const LoopAccessInfo & { 8032 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; 8033 return LAM.getResult<LoopAccessAnalysis>(L, AR); 8034 }; 8035 const ModuleAnalysisManager &MAM = 8036 AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager(); 8037 ProfileSummaryInfo *PSI = 8038 MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 8039 bool Changed = 8040 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 8041 if (!Changed) 8042 return PreservedAnalyses::all(); 8043 PreservedAnalyses PA; 8044 8045 // We currently do not preserve loopinfo/dominator analyses with outer loop 8046 // vectorization. Until this is addressed, mark these analyses as preserved 8047 // only for non-VPlan-native path. 8048 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 8049 if (!EnableVPlanNativePath) { 8050 PA.preserve<LoopAnalysis>(); 8051 PA.preserve<DominatorTreeAnalysis>(); 8052 } 8053 PA.preserve<BasicAA>(); 8054 PA.preserve<GlobalsAA>(); 8055 return PA; 8056 } 8057
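// Note: a convenient way to exercise this pass in isolation with the new pass
// manager is `opt -passes=loop-vectorize -S input.ll` (legacy PM:
// `opt -loop-vectorize`); the remarks emitted above can be surfaced with
// -pass-remarks=loop-vectorize and -pass-remarks-missed=loop-vectorize.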