//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired and that predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
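
// Editorial example (not part of the original source): accesses such as
//   for (i = 0; i < n; ++i) { sum += a[2 * i] + a[2 * i + 1]; }
// form an interleave group with factor 2; when the flag above is enabled, the
// group can be vectorized as one wide load followed by shuffles.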

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
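  // Editorial example (not part of the original source): on common x86
  // targets, x86_fp80 holds 80 bits of data but is allocated in 96 or 128
  // bits, so it is "irregular" here, whereas types like i32 or float are not.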
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}
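
// Editorial note (not part of the original source): for a loop like
//   for (unsigned i = 0; i < 100; ++i) { ... }
// SCEV reports an exact trip count of 100, so the profile-based and
// max-trip-count fallbacks in getSmallBestKnownTC are never consulted.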

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, bool InvariantCond);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
                bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
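
  // Editorial example (not part of the original source): for an i32 induction
  // starting at 0 with step 1 and VF = 4, the widened IV starts as the vector
  // <0, 1, 2, 3> and is advanced by a splat of 4 on every vector iteration.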

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                VPTransformState &State, VPValue *Addr,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr, VPValue *StoredValue,
                                  VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
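
  // Editorial example (not part of the original source): with UF = 2 and
  // VF = 4, a scalarized value is held as 2 parts of 4 scalar Values each,
  // while a vectorized value is held as 2 wide vector Values (see VectorParts
  // above).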

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The loop exit block may have single-value PHI nodes with some incoming
  /// value. While vectorizing we only handled real values that were defined
  /// inside the loop, and we should have one value for each predecessor of its
  /// parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                               Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AliasAnalysis *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                       Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};
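
// Editorial note (not part of the original source): with a trip count of 100
// and VF = 8, an allowed scalar epilogue runs the 4 remaining iterations after
// 96 vectorized ones; under CM_ScalarEpilogueNotNeededUsePredicate the tail is
// instead folded into the vector loop by masking.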

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// Memory access instructions may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }
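
  // Editorial example (not part of the original source): after
  //   setWideningDecision(Load, /*VF=*/4, CM_Widen, LoadCost);
  // a later getWideningDecision(Load, 4) returns CM_Widen and
  // getWideningCost(Load, 4) returns LoadCost.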

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }
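
  // Editorial example (not part of the original source): a store guarded by
  //   if (c[i]) a[i] = x;
  // is a predicated instruction; if the target supports masked stores it can
  // remain a wide store under a mask, otherwise it is scalarized and emitted
  // behind a branch (see isScalarWithPredication above).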

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized -
  /// i.e. either vector version isn't available, or is too expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for scalarized memory instruction.
1356 unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1357
1358 /// The cost computation for an interleaved group of memory instructions.
1359 unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1360
1361 /// The cost computation for a Gather/Scatter instruction.
1362 unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1363
1364 /// The cost computation for widening instruction \p I with consecutive
1365 /// memory access.
1366 unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1367
1368 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1369 /// Load: scalar load + broadcast.
1370 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1371 /// element)
1372 unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1373
1374 /// Estimate the overhead of scalarizing an instruction. This is a
1375 /// convenience wrapper for the type-based getScalarizationOverhead API.
1376 unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1377
1378 /// Returns whether the instruction is a load or store and will be emitted
1379 /// as a vector operation.
1380 bool isConsecutiveLoadOrStore(Instruction *I);
1381
1382 /// Returns true if an artificially high cost for emulated masked memrefs
1383 /// should be used.
1384 bool useEmulatedMaskMemRefHack(Instruction *I);
1385
1386 /// Map of scalar integer values to the smallest bitwidth they can be legally
1387 /// represented as. The vector equivalents of these values should be truncated
1388 /// to this type.
1389 MapVector<Instruction *, uint64_t> MinBWs;
1390
1391 /// A type representing the costs for instructions if they were to be
1392 /// scalarized rather than vectorized. The entries are Instruction-Cost
1393 /// pairs.
1394 using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1395
1396 /// A set containing all BasicBlocks that are known to be present after
1397 /// vectorization as a predicated block.
1398 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1399
1400 /// Records whether it is allowed to have the original scalar loop execute at
1401 /// least once. This may be needed as a fallback loop in case runtime
1402 /// aliasing/dependence checks fail, or to handle the tail/remainder
1403 /// iterations when the trip count is unknown or doesn't divide by the VF,
1404 /// or as a peel-loop to handle gaps in interleave-groups.
1405 /// Under optsize and when the trip count is very small we don't allow any
1406 /// iterations to execute in the scalar loop.
1407 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1408
1409 /// All blocks of the loop are to be masked to fold the tail of scalar iterations.
1410 bool FoldTailByMasking = false;
1411
1412 /// A map holding scalar costs for different vectorization factors. The
1413 /// presence of a cost for an instruction in the mapping indicates that the
1414 /// instruction will be scalarized when vectorizing with the associated
1415 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1416 DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1417
1418 /// Holds the instructions known to be uniform after vectorization.
1419 /// The data is collected per VF.
1420 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1421
1422 /// Holds the instructions known to be scalar after vectorization.
1423 /// The data is collected per VF.
1424 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars; 1425 1426 /// Holds the instructions (address computations) that are forced to be 1427 /// scalarized. 1428 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1429 1430 /// Returns the expected difference in cost from scalarizing the expression 1431 /// feeding a predicated instruction \p PredInst. The instructions to 1432 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1433 /// non-negative return value implies the expression will be scalarized. 1434 /// Currently, only single-use chains are considered for scalarization. 1435 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1436 unsigned VF); 1437 1438 /// Collect the instructions that are uniform after vectorization. An 1439 /// instruction is uniform if we represent it with a single scalar value in 1440 /// the vectorized loop corresponding to each vector iteration. Examples of 1441 /// uniform instructions include pointer operands of consecutive or 1442 /// interleaved memory accesses. Note that although uniformity implies an 1443 /// instruction will be scalar, the reverse is not true. In general, a 1444 /// scalarized instruction will be represented by VF scalar values in the 1445 /// vectorized loop, each corresponding to an iteration of the original 1446 /// scalar loop. 1447 void collectLoopUniforms(unsigned VF); 1448 1449 /// Collect the instructions that are scalar after vectorization. An 1450 /// instruction is scalar if it is known to be uniform or will be scalarized 1451 /// during vectorization. Non-uniform scalarized instructions will be 1452 /// represented by VF values in the vectorized loop, each corresponding to an 1453 /// iteration of the original scalar loop. 1454 void collectLoopScalars(unsigned VF); 1455 1456 /// Keeps cost model vectorization decision and cost for instructions. 1457 /// Right now it is used for memory instructions only. 1458 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, 1459 std::pair<InstWidening, unsigned>>; 1460 1461 DecisionList WideningDecisions; 1462 1463 /// Returns true if \p V is expected to be vectorized and it needs to be 1464 /// extracted. 1465 bool needsExtract(Value *V, unsigned VF) const { 1466 Instruction *I = dyn_cast<Instruction>(V); 1467 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) 1468 return false; 1469 1470 // Assume we can vectorize V (and hence we need extraction) if the 1471 // scalars are not computed yet. This can happen, because it is called 1472 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1473 // the scalars are collected. That should be a safe assumption in most 1474 // cases, because we check if the operands have vectorizable types 1475 // beforehand in LoopVectorizationLegality. 1476 return Scalars.find(VF) == Scalars.end() || 1477 !isScalarAfterVectorization(I, VF); 1478 }; 1479 1480 /// Returns a range containing only operands needing to be extracted. 1481 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1482 unsigned VF) { 1483 return SmallVector<Value *, 4>(make_filter_range( 1484 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1485 } 1486 1487 public: 1488 /// The loop that we evaluate. 1489 Loop *TheLoop; 1490 1491 /// Predicated scalar evolution analysis. 1492 PredicatedScalarEvolution &PSE; 1493 1494 /// Loop Info analysis. 1495 LoopInfo *LI; 1496 1497 /// Vectorization legality. 
1498 LoopVectorizationLegality *Legal; 1499 1500 /// Vector target information. 1501 const TargetTransformInfo &TTI; 1502 1503 /// Target Library Info. 1504 const TargetLibraryInfo *TLI; 1505 1506 /// Demanded bits analysis. 1507 DemandedBits *DB; 1508 1509 /// Assumption cache. 1510 AssumptionCache *AC; 1511 1512 /// Interface to emit optimization remarks. 1513 OptimizationRemarkEmitter *ORE; 1514 1515 const Function *TheFunction; 1516 1517 /// Loop Vectorize Hint. 1518 const LoopVectorizeHints *Hints; 1519 1520 /// The interleave access information contains groups of interleaved accesses 1521 /// with the same stride and close to each other. 1522 InterleavedAccessInfo &InterleaveInfo; 1523 1524 /// Values to ignore in the cost model. 1525 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1526 1527 /// Values to ignore in the cost model when VF > 1. 1528 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1529 }; 1530 1531 } // end namespace llvm 1532 1533 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1534 // vectorization. The loop needs to be annotated with #pragma omp simd 1535 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1536 // vector length information is not provided, vectorization is not considered 1537 // explicit. Interleave hints are not allowed either. These limitations will be 1538 // relaxed in the future. 1539 // Please, note that we are currently forced to abuse the pragma 'clang 1540 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1541 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1542 // provides *explicit vectorization hints* (LV can bypass legal checks and 1543 // assume that vectorization is legal). However, both hints are implemented 1544 // using the same metadata (llvm.loop.vectorize, processed by 1545 // LoopVectorizeHints). This will be fixed in the future when the native IR 1546 // representation for pragma 'omp simd' is introduced. 1547 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1548 OptimizationRemarkEmitter *ORE) { 1549 assert(!OuterLp->empty() && "This is not an outer loop"); 1550 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1551 1552 // Only outer loops with an explicit vectorization hint are supported. 1553 // Unannotated outer loops are ignored. 1554 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1555 return false; 1556 1557 Function *Fn = OuterLp->getHeader()->getParent(); 1558 if (!Hints.allowVectorization(Fn, OuterLp, 1559 true /*VectorizeOnlyWhenForced*/)) { 1560 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1561 return false; 1562 } 1563 1564 if (Hints.getInterleave() > 1) { 1565 // TODO: Interleave support is future work. 1566 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1567 "outer loops.\n"); 1568 Hints.emitRemarkWithHints(); 1569 return false; 1570 } 1571 1572 return true; 1573 } 1574 1575 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1576 OptimizationRemarkEmitter *ORE, 1577 SmallVectorImpl<Loop *> &V) { 1578 // Collect inner loops and outer loops without irreducible control flow. For 1579 // now, only collect outer loops that have explicit vectorization hints. If we 1580 // are stress testing the VPlan H-CFG construction, we collect the outermost 1581 // loop of every loop nest. 
1582 if (L.empty() || VPlanBuildStressTest || 1583 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1584 LoopBlocksRPO RPOT(&L); 1585 RPOT.perform(LI); 1586 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1587 V.push_back(&L); 1588 // TODO: Collect inner loops inside marked outer loops in case 1589 // vectorization fails for the outer loop. Do not invoke 1590 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1591 // already known to be reducible. We can use an inherited attribute for 1592 // that. 1593 return; 1594 } 1595 } 1596 for (Loop *InnerL : L) 1597 collectSupportedLoops(*InnerL, LI, ORE, V); 1598 } 1599 1600 namespace { 1601 1602 /// The LoopVectorize Pass. 1603 struct LoopVectorize : public FunctionPass { 1604 /// Pass identification, replacement for typeid 1605 static char ID; 1606 1607 LoopVectorizePass Impl; 1608 1609 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1610 bool VectorizeOnlyWhenForced = false) 1611 : FunctionPass(ID), 1612 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1613 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1614 } 1615 1616 bool runOnFunction(Function &F) override { 1617 if (skipFunction(F)) 1618 return false; 1619 1620 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1621 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1622 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1623 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1624 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1625 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1626 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1627 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1628 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1629 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1630 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1631 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1632 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1633 1634 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1635 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1636 1637 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1638 GetLAA, *ORE, PSI).MadeAnyChange; 1639 } 1640 1641 void getAnalysisUsage(AnalysisUsage &AU) const override { 1642 AU.addRequired<AssumptionCacheTracker>(); 1643 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1644 AU.addRequired<DominatorTreeWrapperPass>(); 1645 AU.addRequired<LoopInfoWrapperPass>(); 1646 AU.addRequired<ScalarEvolutionWrapperPass>(); 1647 AU.addRequired<TargetTransformInfoWrapperPass>(); 1648 AU.addRequired<AAResultsWrapperPass>(); 1649 AU.addRequired<LoopAccessLegacyAnalysis>(); 1650 AU.addRequired<DemandedBitsWrapperPass>(); 1651 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1652 AU.addRequired<InjectTLIMappingsLegacy>(); 1653 1654 // We currently do not preserve loopinfo/dominator analyses with outer loop 1655 // vectorization. Until this is addressed, mark these analyses as preserved 1656 // only for non-VPlan-native path. 1657 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
1658 if (!EnableVPlanNativePath) { 1659 AU.addPreserved<LoopInfoWrapperPass>(); 1660 AU.addPreserved<DominatorTreeWrapperPass>(); 1661 } 1662 1663 AU.addPreserved<BasicAAWrapperPass>(); 1664 AU.addPreserved<GlobalsAAWrapperPass>(); 1665 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1666 } 1667 }; 1668 1669 } // end anonymous namespace 1670 1671 //===----------------------------------------------------------------------===// 1672 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1673 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1674 //===----------------------------------------------------------------------===// 1675 1676 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1677 // We need to place the broadcast of invariant variables outside the loop, 1678 // but only if it's proven safe to do so. Else, broadcast will be inside 1679 // vector loop body. 1680 Instruction *Instr = dyn_cast<Instruction>(V); 1681 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1682 (!Instr || 1683 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1684 // Place the code for broadcasting invariant variables in the new preheader. 1685 IRBuilder<>::InsertPointGuard Guard(Builder); 1686 if (SafeToHoist) 1687 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1688 1689 // Broadcast the scalar into all locations in the vector. 1690 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1691 1692 return Shuf; 1693 } 1694 1695 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1696 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1697 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1698 "Expected either an induction phi-node or a truncate of it!"); 1699 Value *Start = II.getStartValue(); 1700 1701 // Construct the initial value of the vector IV in the vector loop preheader 1702 auto CurrIP = Builder.saveIP(); 1703 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1704 if (isa<TruncInst>(EntryVal)) { 1705 assert(Start->getType()->isIntegerTy() && 1706 "Truncation requires an integer type"); 1707 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1708 Step = Builder.CreateTrunc(Step, TruncType); 1709 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1710 } 1711 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1712 Value *SteppedStart = 1713 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1714 1715 // We create vector phi nodes for both integer and floating-point induction 1716 // variables. Here, we determine the kind of arithmetic we will perform. 1717 Instruction::BinaryOps AddOp; 1718 Instruction::BinaryOps MulOp; 1719 if (Step->getType()->isIntegerTy()) { 1720 AddOp = Instruction::Add; 1721 MulOp = Instruction::Mul; 1722 } else { 1723 AddOp = II.getInductionOpcode(); 1724 MulOp = Instruction::FMul; 1725 } 1726 1727 // Multiply the vectorization factor by the step using integer or 1728 // floating-point arithmetic as appropriate. 1729 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); 1730 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1731 1732 // Create a vector splat to use in the induction update. 1733 // 1734 // FIXME: If the step is non-constant, we create the vector splat with 1735 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1736 // handle a constant vector splat. 1737 Value *SplatVF = 1738 isa<Constant>(Mul) 1739 ? 
ConstantVector::getSplat({VF, false}, cast<Constant>(Mul)) 1740 : Builder.CreateVectorSplat(VF, Mul); 1741 Builder.restoreIP(CurrIP); 1742 1743 // We may need to add the step a number of times, depending on the unroll 1744 // factor. The last of those goes into the PHI. 1745 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1746 &*LoopVectorBody->getFirstInsertionPt()); 1747 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1748 Instruction *LastInduction = VecInd; 1749 for (unsigned Part = 0; Part < UF; ++Part) { 1750 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1751 1752 if (isa<TruncInst>(EntryVal)) 1753 addMetadata(LastInduction, EntryVal); 1754 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1755 1756 LastInduction = cast<Instruction>(addFastMathFlag( 1757 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1758 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1759 } 1760 1761 // Move the last step to the end of the latch block. This ensures consistent 1762 // placement of all induction updates. 1763 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1764 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1765 auto *ICmp = cast<Instruction>(Br->getCondition()); 1766 LastInduction->moveBefore(ICmp); 1767 LastInduction->setName("vec.ind.next"); 1768 1769 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1770 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1771 } 1772 1773 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1774 return Cost->isScalarAfterVectorization(I, VF) || 1775 Cost->isProfitableToScalarize(I, VF); 1776 } 1777 1778 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1779 if (shouldScalarizeInstruction(IV)) 1780 return true; 1781 auto isScalarInst = [&](User *U) -> bool { 1782 auto *I = cast<Instruction>(U); 1783 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1784 }; 1785 return llvm::any_of(IV->users(), isScalarInst); 1786 } 1787 1788 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1789 const InductionDescriptor &ID, const Instruction *EntryVal, 1790 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1791 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1792 "Expected either an induction phi-node or a truncate of it!"); 1793 1794 // This induction variable is not the phi from the original loop but the 1795 // newly-created IV based on the proof that casted Phi is equal to the 1796 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1797 // re-uses the same InductionDescriptor that original IV uses but we don't 1798 // have to do any recording in this case - that is done when original IV is 1799 // processed. 1800 if (isa<TruncInst>(EntryVal)) 1801 return; 1802 1803 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1804 if (Casts.empty()) 1805 return; 1806 // Only the first Cast instruction in the Casts vector is of interest. 1807 // The rest of the Casts (if exist) have no uses outside the 1808 // induction update chain itself. 
1809 Instruction *CastInst = *Casts.begin(); 1810 if (Lane < UINT_MAX) 1811 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1812 else 1813 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1814 } 1815 1816 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1817 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1818 "Primary induction variable must have an integer type"); 1819 1820 auto II = Legal->getInductionVars().find(IV); 1821 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 1822 1823 auto ID = II->second; 1824 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1825 1826 // The value from the original loop to which we are mapping the new induction 1827 // variable. 1828 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1829 1830 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1831 1832 // Generate code for the induction step. Note that induction steps are 1833 // required to be loop-invariant 1834 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 1835 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 1836 "Induction step should be loop invariant"); 1837 if (PSE.getSE()->isSCEVable(IV->getType())) { 1838 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1839 return Exp.expandCodeFor(Step, Step->getType(), 1840 LoopVectorPreHeader->getTerminator()); 1841 } 1842 return cast<SCEVUnknown>(Step)->getValue(); 1843 }; 1844 1845 // The scalar value to broadcast. This is derived from the canonical 1846 // induction variable. If a truncation type is given, truncate the canonical 1847 // induction variable and step. Otherwise, derive these values from the 1848 // induction descriptor. 1849 auto CreateScalarIV = [&](Value *&Step) -> Value * { 1850 Value *ScalarIV = Induction; 1851 if (IV != OldInduction) { 1852 ScalarIV = IV->getType()->isIntegerTy() 1853 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1854 : Builder.CreateCast(Instruction::SIToFP, Induction, 1855 IV->getType()); 1856 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1857 ScalarIV->setName("offset.idx"); 1858 } 1859 if (Trunc) { 1860 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1861 assert(Step->getType()->isIntegerTy() && 1862 "Truncation requires an integer step"); 1863 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1864 Step = Builder.CreateTrunc(Step, TruncType); 1865 } 1866 return ScalarIV; 1867 }; 1868 1869 // Create the vector values from the scalar IV, in the absence of creating a 1870 // vector IV. 1871 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 1872 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1873 for (unsigned Part = 0; Part < UF; ++Part) { 1874 Value *EntryPart = 1875 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); 1876 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1877 if (Trunc) 1878 addMetadata(EntryPart, Trunc); 1879 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1880 } 1881 }; 1882 1883 // Now do the actual transformations, and start with creating the step value. 1884 Value *Step = CreateStepValue(ID.getStep()); 1885 if (VF <= 1) { 1886 Value *ScalarIV = CreateScalarIV(Step); 1887 CreateSplatIV(ScalarIV, Step); 1888 return; 1889 } 1890 1891 // Determine if we want a scalar version of the induction variable. 
This is 1892 // true if the induction variable itself is not widened, or if it has at 1893 // least one user in the loop that is not widened. 1894 auto NeedsScalarIV = needsScalarInduction(EntryVal); 1895 if (!NeedsScalarIV) { 1896 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1897 return; 1898 } 1899 1900 // Try to create a new independent vector induction variable. If we can't 1901 // create the phi node, we will splat the scalar induction variable in each 1902 // loop iteration. 1903 if (!shouldScalarizeInstruction(EntryVal)) { 1904 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1905 Value *ScalarIV = CreateScalarIV(Step); 1906 // Create scalar steps that can be used by instructions we will later 1907 // scalarize. Note that the addition of the scalar steps will not increase 1908 // the number of instructions in the loop in the common case prior to 1909 // InstCombine. We will be trading one vector extract for each scalar step. 1910 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1911 return; 1912 } 1913 1914 // All IV users are scalar instructions, so only emit a scalar IV, not a 1915 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 1916 // predicate used by the masked loads/stores. 1917 Value *ScalarIV = CreateScalarIV(Step); 1918 if (!Cost->isScalarEpilogueAllowed()) 1919 CreateSplatIV(ScalarIV, Step); 1920 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1921 } 1922 1923 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 1924 Instruction::BinaryOps BinOp) { 1925 // Create and check the types. 1926 auto *ValVTy = cast<VectorType>(Val->getType()); 1927 int VLen = ValVTy->getNumElements(); 1928 1929 Type *STy = Val->getType()->getScalarType(); 1930 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1931 "Induction Step must be an integer or FP"); 1932 assert(Step->getType() == STy && "Step has wrong type"); 1933 1934 SmallVector<Constant *, 8> Indices; 1935 1936 if (STy->isIntegerTy()) { 1937 // Create a vector of consecutive numbers from zero to VF. 1938 for (int i = 0; i < VLen; ++i) 1939 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 1940 1941 // Add the consecutive indices to the vector value. 1942 Constant *Cv = ConstantVector::get(Indices); 1943 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 1944 Step = Builder.CreateVectorSplat(VLen, Step); 1945 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1946 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 1947 // which can be found from the original scalar operations. 1948 Step = Builder.CreateMul(Cv, Step); 1949 return Builder.CreateAdd(Val, Step, "induction"); 1950 } 1951 1952 // Floating point induction. 1953 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1954 "Binary Opcode should be specified for FP induction"); 1955 // Create a vector of consecutive numbers from zero to VF. 1956 for (int i = 0; i < VLen; ++i) 1957 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 1958 1959 // Add the consecutive indices to the vector value. 1960 Constant *Cv = ConstantVector::get(Indices); 1961 1962 Step = Builder.CreateVectorSplat(VLen, Step); 1963 1964 // Floating point operations had to be 'fast' to enable the induction. 
1965 FastMathFlags Flags; 1966 Flags.setFast(); 1967 1968 Value *MulOp = Builder.CreateFMul(Cv, Step); 1969 if (isa<Instruction>(MulOp)) 1970 // Have to check, MulOp may be a constant 1971 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 1972 1973 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1974 if (isa<Instruction>(BOp)) 1975 cast<Instruction>(BOp)->setFastMathFlags(Flags); 1976 return BOp; 1977 } 1978 1979 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 1980 Instruction *EntryVal, 1981 const InductionDescriptor &ID) { 1982 // We shouldn't have to build scalar steps if we aren't vectorizing. 1983 assert(VF > 1 && "VF should be greater than one"); 1984 1985 // Get the value type and ensure it and the step have the same integer type. 1986 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 1987 assert(ScalarIVTy == Step->getType() && 1988 "Val and Step should have the same type"); 1989 1990 // We build scalar steps for both integer and floating-point induction 1991 // variables. Here, we determine the kind of arithmetic we will perform. 1992 Instruction::BinaryOps AddOp; 1993 Instruction::BinaryOps MulOp; 1994 if (ScalarIVTy->isIntegerTy()) { 1995 AddOp = Instruction::Add; 1996 MulOp = Instruction::Mul; 1997 } else { 1998 AddOp = ID.getInductionOpcode(); 1999 MulOp = Instruction::FMul; 2000 } 2001 2002 // Determine the number of scalars we need to generate for each unroll 2003 // iteration. If EntryVal is uniform, we only need to generate the first 2004 // lane. Otherwise, we generate all VF values. 2005 unsigned Lanes = 2006 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 2007 : VF; 2008 // Compute the scalar steps and save the results in VectorLoopValueMap. 2009 for (unsigned Part = 0; Part < UF; ++Part) { 2010 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2011 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); 2012 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2013 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2014 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2015 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2016 } 2017 } 2018 } 2019 2020 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2021 assert(V != Induction && "The new induction variable should not be used."); 2022 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2023 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2024 2025 // If we have a stride that is replaced by one, do it here. Defer this for 2026 // the VPlan-native path until we start running Legal checks in that path. 2027 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2028 V = ConstantInt::get(V->getType(), 1); 2029 2030 // If we have a vector mapped to this value, return it. 2031 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2032 return VectorLoopValueMap.getVectorValue(V, Part); 2033 2034 // If the value has not been vectorized, check if it has been scalarized 2035 // instead. If it has been scalarized, and we actually need the value in 2036 // vector form, we will construct the vector values on demand. 2037 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2038 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2039 2040 // If we've scalarized a value, that value should be an instruction. 
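  // As an illustration of the packing path below (values chosen arbitrarily):
  // a non-uniform scalarized i32 value with VF = 4 is rebuilt as a chain of
  // four insertelement instructions into an undef <4 x i32> vector, one per
  // lane, via packScalarIntoVectorValue.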
2041 auto *I = cast<Instruction>(V); 2042 2043 // If we aren't vectorizing, we can just copy the scalar map values over to 2044 // the vector map. 2045 if (VF == 1) { 2046 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2047 return ScalarValue; 2048 } 2049 2050 // Get the last scalar instruction we generated for V and Part. If the value 2051 // is known to be uniform after vectorization, this corresponds to lane zero 2052 // of the Part unroll iteration. Otherwise, the last instruction is the one 2053 // we created for the last vector lane of the Part unroll iteration. 2054 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; 2055 auto *LastInst = cast<Instruction>( 2056 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2057 2058 // Set the insert point after the last scalarized instruction. This ensures 2059 // the insertelement sequence will directly follow the scalar definitions. 2060 auto OldIP = Builder.saveIP(); 2061 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2062 Builder.SetInsertPoint(&*NewIP); 2063 2064 // However, if we are vectorizing, we need to construct the vector values. 2065 // If the value is known to be uniform after vectorization, we can just 2066 // broadcast the scalar value corresponding to lane zero for each unroll 2067 // iteration. Otherwise, we construct the vector values using insertelement 2068 // instructions. Since the resulting vectors are stored in 2069 // VectorLoopValueMap, we will only generate the insertelements once. 2070 Value *VectorValue = nullptr; 2071 if (Cost->isUniformAfterVectorization(I, VF)) { 2072 VectorValue = getBroadcastInstrs(ScalarValue); 2073 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2074 } else { 2075 // Initialize packing with insertelements to start from undef. 2076 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2077 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2078 for (unsigned Lane = 0; Lane < VF; ++Lane) 2079 packScalarIntoVectorValue(V, {Part, Lane}); 2080 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2081 } 2082 Builder.restoreIP(OldIP); 2083 return VectorValue; 2084 } 2085 2086 // If this scalar is unknown, assume that it is a constant or that it is 2087 // loop invariant. Broadcast V and save the value for future uses. 2088 Value *B = getBroadcastInstrs(V); 2089 VectorLoopValueMap.setVectorValue(V, Part, B); 2090 return B; 2091 } 2092 2093 Value * 2094 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2095 const VPIteration &Instance) { 2096 // If the value is not an instruction contained in the loop, it should 2097 // already be scalar. 2098 if (OrigLoop->isLoopInvariant(V)) 2099 return V; 2100 2101 assert(Instance.Lane > 0 2102 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2103 : true && "Uniform values only have lane zero"); 2104 2105 // If the value from the original loop has not been vectorized, it is 2106 // represented by UF x VF scalar values in the new loop. Return the requested 2107 // scalar value. 2108 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2109 return VectorLoopValueMap.getScalarValue(V, Instance); 2110 2111 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2112 // for the given unroll part. If this entry is not a vector type (i.e., the 2113 // vectorization factor is one), there is no need to generate an 2114 // extractelement instruction. 
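  // As an illustration of the extract emitted at the end of this function
  // (types and names chosen arbitrarily): with VF = 4, requesting
  // {Part, Lane} = {0, 2} of a vectorized i32 value produces
  //   %lane2 = extractelement <4 x i32> %vec.part0, i32 2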
2115 auto *U = getOrCreateVectorValue(V, Instance.Part); 2116 if (!U->getType()->isVectorTy()) { 2117 assert(VF == 1 && "Value not scalarized has non-vector type"); 2118 return U; 2119 } 2120 2121 // Otherwise, the value from the original loop has been vectorized and is 2122 // represented by UF vector values. Extract and return the requested scalar 2123 // value from the appropriate vector lane. 2124 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2125 } 2126 2127 void InnerLoopVectorizer::packScalarIntoVectorValue( 2128 Value *V, const VPIteration &Instance) { 2129 assert(V != Induction && "The new induction variable should not be used."); 2130 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2131 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2132 2133 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2134 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2135 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2136 Builder.getInt32(Instance.Lane)); 2137 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2138 } 2139 2140 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2141 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2142 SmallVector<int, 8> ShuffleMask; 2143 for (unsigned i = 0; i < VF; ++i) 2144 ShuffleMask.push_back(VF - i - 1); 2145 2146 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2147 ShuffleMask, "reverse"); 2148 } 2149 2150 // Return whether we allow using masked interleave-groups (for dealing with 2151 // strided loads/stores that reside in predicated blocks, or for dealing 2152 // with gaps). 2153 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2154 // If an override option has been passed in for interleaved accesses, use it. 2155 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2156 return EnableMaskedInterleavedMemAccesses; 2157 2158 return TTI.enableMaskedInterleavedAccessVectorization(); 2159 } 2160 2161 // Try to vectorize the interleave group that \p Instr belongs to. 2162 // 2163 // E.g. Translate following interleaved load group (factor = 3): 2164 // for (i = 0; i < N; i+=3) { 2165 // R = Pic[i]; // Member of index 0 2166 // G = Pic[i+1]; // Member of index 1 2167 // B = Pic[i+2]; // Member of index 2 2168 // ... // do something to R, G, B 2169 // } 2170 // To: 2171 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2172 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2173 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2174 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2175 // 2176 // Or translate following interleaved store group (factor = 3): 2177 // for (i = 0; i < N; i+=3) { 2178 // ... 
do something to R, G, B 2179 // Pic[i] = R; // Member of index 0 2180 // Pic[i+1] = G; // Member of index 1 2181 // Pic[i+2] = B; // Member of index 2 2182 // } 2183 // To: 2184 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2185 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2186 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2187 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2188 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2189 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2190 const InterleaveGroup<Instruction> *Group, VPTransformState &State, 2191 VPValue *Addr, VPValue *BlockInMask) { 2192 Instruction *Instr = Group->getInsertPos(); 2193 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2194 2195 // Prepare for the vector type of the interleaved load/store. 2196 Type *ScalarTy = getMemInstValueType(Instr); 2197 unsigned InterleaveFactor = Group->getFactor(); 2198 Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); 2199 2200 // Prepare for the new pointers. 2201 SmallVector<Value *, 2> AddrParts; 2202 unsigned Index = Group->getIndex(Instr); 2203 2204 // TODO: extend the masked interleaved-group support to reversed access. 2205 assert((!BlockInMask || !Group->isReverse()) && 2206 "Reversed masked interleave-group not supported."); 2207 2208 // If the group is reverse, adjust the index to refer to the last vector lane 2209 // instead of the first. We adjust the index from the first vector lane, 2210 // rather than directly getting the pointer for lane VF - 1, because the 2211 // pointer operand of the interleaved access is supposed to be uniform. For 2212 // uniform instructions, we're only required to generate a value for the 2213 // first vector lane in each unroll iteration. 2214 if (Group->isReverse()) 2215 Index += (VF - 1) * Group->getFactor(); 2216 2217 for (unsigned Part = 0; Part < UF; Part++) { 2218 Value *AddrPart = State.get(Addr, {Part, 0}); 2219 setDebugLocFromInst(Builder, AddrPart); 2220 2221 // Notice current instruction could be any index. Need to adjust the address 2222 // to the member of index 0. 2223 // 2224 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2225 // b = A[i]; // Member of index 0 2226 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2227 // 2228 // E.g. A[i+1] = a; // Member of index 1 2229 // A[i] = b; // Member of index 0 2230 // A[i+2] = c; // Member of index 2 (Current instruction) 2231 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2232 2233 bool InBounds = false; 2234 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2235 InBounds = gep->isInBounds(); 2236 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2237 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2238 2239 // Cast to the vector pointer type. 2240 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2241 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2242 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2243 } 2244 2245 setDebugLocFromInst(Builder, Instr); 2246 Value *UndefVec = UndefValue::get(VecTy); 2247 2248 Value *MaskForGaps = nullptr; 2249 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2250 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); 2251 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2252 } 2253 2254 // Vectorize the interleaved load group. 
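  // Illustration of the mask handling below (VF and factor chosen
  // arbitrarily): with VF = 4 and an interleave factor of 3, a block mask
  // <m0, m1, m2, m3> is replicated by createReplicatedMask to
  // <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3>, so each original lane's
  // predicate covers every member of its tuple.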
2255 if (isa<LoadInst>(Instr)) { 2256 // For each unroll part, create a wide load for the group. 2257 SmallVector<Value *, 2> NewLoads; 2258 for (unsigned Part = 0; Part < UF; Part++) { 2259 Instruction *NewLoad; 2260 if (BlockInMask || MaskForGaps) { 2261 assert(useMaskedInterleavedAccesses(*TTI) && 2262 "masked interleaved groups are not allowed."); 2263 Value *GroupMask = MaskForGaps; 2264 if (BlockInMask) { 2265 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2266 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2267 Value *ShuffledMask = Builder.CreateShuffleVector( 2268 BlockInMaskPart, Undefs, 2269 createReplicatedMask(InterleaveFactor, VF), "interleaved.mask"); 2270 GroupMask = MaskForGaps 2271 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2272 MaskForGaps) 2273 : ShuffledMask; 2274 } 2275 NewLoad = 2276 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2277 GroupMask, UndefVec, "wide.masked.vec"); 2278 } 2279 else 2280 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2281 Group->getAlign(), "wide.vec"); 2282 Group->addMetadata(NewLoad); 2283 NewLoads.push_back(NewLoad); 2284 } 2285 2286 // For each member in the group, shuffle out the appropriate data from the 2287 // wide loads. 2288 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2289 Instruction *Member = Group->getMember(I); 2290 2291 // Skip the gaps in the group. 2292 if (!Member) 2293 continue; 2294 2295 auto StrideMask = createStrideMask(I, InterleaveFactor, VF); 2296 for (unsigned Part = 0; Part < UF; Part++) { 2297 Value *StridedVec = Builder.CreateShuffleVector( 2298 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2299 2300 // If this member has different type, cast the result type. 2301 if (Member->getType() != ScalarTy) { 2302 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2303 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2304 } 2305 2306 if (Group->isReverse()) 2307 StridedVec = reverseVector(StridedVec); 2308 2309 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2310 } 2311 } 2312 return; 2313 } 2314 2315 // The sub vector type for current instruction. 2316 VectorType *SubVT = VectorType::get(ScalarTy, VF); 2317 2318 // Vectorize the interleaved store group. 2319 for (unsigned Part = 0; Part < UF; Part++) { 2320 // Collect the stored vector from each member. 2321 SmallVector<Value *, 4> StoredVecs; 2322 for (unsigned i = 0; i < InterleaveFactor; i++) { 2323 // Interleaved store group doesn't allow a gap, so each index has a member 2324 Instruction *Member = Group->getMember(i); 2325 assert(Member && "Fail to get a member from an interleaved store group"); 2326 2327 Value *StoredVec = getOrCreateVectorValue( 2328 cast<StoreInst>(Member)->getValueOperand(), Part); 2329 if (Group->isReverse()) 2330 StoredVec = reverseVector(StoredVec); 2331 2332 // If this member has different type, cast it to a unified type. 2333 2334 if (StoredVec->getType() != SubVT) 2335 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2336 2337 StoredVecs.push_back(StoredVec); 2338 } 2339 2340 // Concatenate all vectors into a wide vector. 2341 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2342 2343 // Interleave the elements in the wide vector. 
2344 Value *IVec = Builder.CreateShuffleVector( 2345 WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor), 2346 "interleaved.vec"); 2347 2348 Instruction *NewStoreInstr; 2349 if (BlockInMask) { 2350 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2351 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2352 Value *ShuffledMask = Builder.CreateShuffleVector( 2353 BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF), 2354 "interleaved.mask"); 2355 NewStoreInstr = Builder.CreateMaskedStore( 2356 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2357 } 2358 else 2359 NewStoreInstr = 2360 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2361 2362 Group->addMetadata(NewStoreInstr); 2363 } 2364 } 2365 2366 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2367 VPTransformState &State, 2368 VPValue *Addr, 2369 VPValue *StoredValue, 2370 VPValue *BlockInMask) { 2371 // Attempt to issue a wide load. 2372 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2373 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2374 2375 assert((LI || SI) && "Invalid Load/Store instruction"); 2376 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2377 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2378 2379 LoopVectorizationCostModel::InstWidening Decision = 2380 Cost->getWideningDecision(Instr, VF); 2381 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2382 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2383 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2384 "CM decision is not to widen the memory instruction"); 2385 2386 Type *ScalarDataTy = getMemInstValueType(Instr); 2387 Type *DataTy = VectorType::get(ScalarDataTy, VF); 2388 const Align Alignment = getLoadStoreAlignment(Instr); 2389 2390 // Determine if the pointer operand of the access is either consecutive or 2391 // reverse consecutive. 2392 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2393 bool ConsecutiveStride = 2394 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2395 bool CreateGatherScatter = 2396 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2397 2398 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2399 // gather/scatter. Otherwise Decision should have been to Scalarize. 2400 assert((ConsecutiveStride || CreateGatherScatter) && 2401 "The instruction should be scalarized"); 2402 (void)ConsecutiveStride; 2403 2404 VectorParts BlockInMaskParts(UF); 2405 bool isMaskRequired = BlockInMask; 2406 if (isMaskRequired) 2407 for (unsigned Part = 0; Part < UF; ++Part) 2408 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2409 2410 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2411 // Calculate the pointer for the specific unroll-part. 2412 GetElementPtrInst *PartPtr = nullptr; 2413 2414 bool InBounds = false; 2415 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2416 InBounds = gep->isInBounds(); 2417 2418 if (Reverse) { 2419 // If the address is consecutive but reversed, then the 2420 // wide store needs to start at the last vector element. 
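        // For example, with VF = 4 and Part = 1 the two GEPs below offset the
        // pointer by -Part * VF = -4 and then by 1 - VF = -3, so this part's
        // wide access covers Ptr[-7] .. Ptr[-4]; the element order itself is
        // handled separately by reverseVector.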
2421 PartPtr = cast<GetElementPtrInst>( 2422 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); 2423 PartPtr->setIsInBounds(InBounds); 2424 PartPtr = cast<GetElementPtrInst>( 2425 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); 2426 PartPtr->setIsInBounds(InBounds); 2427 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2428 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2429 } else { 2430 PartPtr = cast<GetElementPtrInst>( 2431 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); 2432 PartPtr->setIsInBounds(InBounds); 2433 } 2434 2435 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2436 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2437 }; 2438 2439 // Handle Stores: 2440 if (SI) { 2441 setDebugLocFromInst(Builder, SI); 2442 2443 for (unsigned Part = 0; Part < UF; ++Part) { 2444 Instruction *NewSI = nullptr; 2445 Value *StoredVal = State.get(StoredValue, Part); 2446 if (CreateGatherScatter) { 2447 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2448 Value *VectorGep = State.get(Addr, Part); 2449 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2450 MaskPart); 2451 } else { 2452 if (Reverse) { 2453 // If we store to reverse consecutive memory locations, then we need 2454 // to reverse the order of elements in the stored value. 2455 StoredVal = reverseVector(StoredVal); 2456 // We don't want to update the value in the map as it might be used in 2457 // another expression. So don't call resetVectorValue(StoredVal). 2458 } 2459 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2460 if (isMaskRequired) 2461 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2462 BlockInMaskParts[Part]); 2463 else 2464 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2465 } 2466 addMetadata(NewSI, SI); 2467 } 2468 return; 2469 } 2470 2471 // Handle loads. 2472 assert(LI && "Must have a load instruction"); 2473 setDebugLocFromInst(Builder, LI); 2474 for (unsigned Part = 0; Part < UF; ++Part) { 2475 Value *NewLI; 2476 if (CreateGatherScatter) { 2477 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2478 Value *VectorGep = State.get(Addr, Part); 2479 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2480 nullptr, "wide.masked.gather"); 2481 addMetadata(NewLI, LI); 2482 } else { 2483 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2484 if (isMaskRequired) 2485 NewLI = Builder.CreateMaskedLoad( 2486 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2487 "wide.masked.load"); 2488 else 2489 NewLI = 2490 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2491 2492 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2493 addMetadata(NewLI, LI); 2494 if (Reverse) 2495 NewLI = reverseVector(NewLI); 2496 } 2497 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2498 } 2499 } 2500 2501 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2502 const VPIteration &Instance, 2503 bool IfPredicateInstr, 2504 VPTransformState &State) { 2505 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2506 2507 setDebugLocFromInst(Builder, Instr); 2508 2509 // Does this instruction return a value ? 
2510 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2511 2512 Instruction *Cloned = Instr->clone(); 2513 if (!IsVoidRetTy) 2514 Cloned->setName(Instr->getName() + ".cloned"); 2515 2516 // Replace the operands of the cloned instructions with their scalar 2517 // equivalents in the new loop. 2518 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2519 auto *NewOp = State.get(User.getOperand(op), Instance); 2520 Cloned->setOperand(op, NewOp); 2521 } 2522 addNewMetadata(Cloned, Instr); 2523 2524 // Place the cloned scalar in the new loop. 2525 Builder.Insert(Cloned); 2526 2527 // Add the cloned scalar to the scalar map entry. 2528 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2529 2530 // If we just cloned a new assumption, add it the assumption cache. 2531 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2532 if (II->getIntrinsicID() == Intrinsic::assume) 2533 AC->registerAssumption(II); 2534 2535 // End if-block. 2536 if (IfPredicateInstr) 2537 PredicatedInstructions.push_back(Cloned); 2538 } 2539 2540 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2541 Value *End, Value *Step, 2542 Instruction *DL) { 2543 BasicBlock *Header = L->getHeader(); 2544 BasicBlock *Latch = L->getLoopLatch(); 2545 // As we're just creating this loop, it's possible no latch exists 2546 // yet. If so, use the header as this will be a single block loop. 2547 if (!Latch) 2548 Latch = Header; 2549 2550 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2551 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2552 setDebugLocFromInst(Builder, OldInst); 2553 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2554 2555 Builder.SetInsertPoint(Latch->getTerminator()); 2556 setDebugLocFromInst(Builder, OldInst); 2557 2558 // Create i+1 and fill the PHINode. 2559 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2560 Induction->addIncoming(Start, L->getLoopPreheader()); 2561 Induction->addIncoming(Next, Latch); 2562 // Create the compare. 2563 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2564 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2565 2566 // Now we have two terminators. Remove the old one from the block. 2567 Latch->getTerminator()->eraseFromParent(); 2568 2569 return Induction; 2570 } 2571 2572 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2573 if (TripCount) 2574 return TripCount; 2575 2576 assert(L && "Create Trip Count for null loop."); 2577 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2578 // Find the loop boundaries. 2579 ScalarEvolution *SE = PSE.getSE(); 2580 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2581 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2582 "Invalid loop count"); 2583 2584 Type *IdxTy = Legal->getWidestInductionType(); 2585 assert(IdxTy && "No type for induction"); 2586 2587 // The exit count might have the type of i64 while the phi is i32. This can 2588 // happen if we have an induction variable that is sign extended before the 2589 // compare. The only way that we get a backedge taken count is that the 2590 // induction variable was signed and as such will not overflow. In such a case 2591 // truncation is legal. 
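  // For example, an i64 backedge-taken count is truncated below when the
  // widest induction type IdxTy is i32; a count that is already narrower than
  // IdxTy is zero-extended to IdxTy instead.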
2592 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2593 IdxTy->getPrimitiveSizeInBits()) 2594 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2595 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2596 2597 // Get the total trip count from the count by adding 1. 2598 const SCEV *ExitCount = SE->getAddExpr( 2599 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2600 2601 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2602 2603 // Expand the trip count and place the new instructions in the preheader. 2604 // Notice that the pre-header does not change, only the loop body. 2605 SCEVExpander Exp(*SE, DL, "induction"); 2606 2607 // Count holds the overall loop count (N). 2608 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2609 L->getLoopPreheader()->getTerminator()); 2610 2611 if (TripCount->getType()->isPointerTy()) 2612 TripCount = 2613 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2614 L->getLoopPreheader()->getTerminator()); 2615 2616 return TripCount; 2617 } 2618 2619 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2620 if (VectorTripCount) 2621 return VectorTripCount; 2622 2623 Value *TC = getOrCreateTripCount(L); 2624 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2625 2626 Type *Ty = TC->getType(); 2627 Constant *Step = ConstantInt::get(Ty, VF * UF); 2628 2629 // If the tail is to be folded by masking, round the number of iterations N 2630 // up to a multiple of Step instead of rounding down. This is done by first 2631 // adding Step-1 and then rounding down. Note that it's ok if this addition 2632 // overflows: the vector induction variable will eventually wrap to zero given 2633 // that it starts at zero and its Step is a power of two; the loop will then 2634 // exit, with the last early-exit vector comparison also producing all-true. 2635 if (Cost->foldTailByMasking()) { 2636 assert(isPowerOf2_32(VF * UF) && 2637 "VF*UF must be a power of 2 when folding tail by masking"); 2638 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); 2639 } 2640 2641 // Now we need to generate the expression for the part of the loop that the 2642 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2643 // iterations are not required for correctness, or N - Step, otherwise. Step 2644 // is equal to the vectorization factor (number of SIMD elements) times the 2645 // unroll factor (number of SIMD instructions). 2646 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2647 2648 // If there is a non-reversed interleaved group that may speculatively access 2649 // memory out-of-bounds, we need to ensure that there will be at least one 2650 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2651 // the trip count, we set the remainder to be equal to the step. If the step 2652 // does not evenly divide the trip count, no adjustment is necessary since 2653 // there will already be scalar iterations. Note that the minimum iterations 2654 // check ensures that N >= Step. 
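  // For example (arbitrary values): with VF * UF = 8 and a trip count of 24,
  // R is 0 here; selecting Step below gives a vector trip count of 16 and
  // leaves 8 iterations for the scalar epilogue. With a trip count of 27,
  // R is already 3 and no adjustment is made.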
2655 if (VF > 1 && Cost->requiresScalarEpilogue()) { 2656 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2657 R = Builder.CreateSelect(IsZero, Step, R); 2658 } 2659 2660 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2661 2662 return VectorTripCount; 2663 } 2664 2665 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2666 const DataLayout &DL) { 2667 // Verify that V is a vector type with same number of elements as DstVTy. 2668 unsigned VF = DstVTy->getNumElements(); 2669 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2670 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2671 Type *SrcElemTy = SrcVecTy->getElementType(); 2672 Type *DstElemTy = DstVTy->getElementType(); 2673 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2674 "Vector elements must have same size"); 2675 2676 // Do a direct cast if element types are castable. 2677 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2678 return Builder.CreateBitOrPointerCast(V, DstVTy); 2679 } 2680 // V cannot be directly casted to desired vector type. 2681 // May happen when V is a floating point vector but DstVTy is a vector of 2682 // pointers or vice-versa. Handle this using a two-step bitcast using an 2683 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2684 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2685 "Only one type should be a pointer type"); 2686 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2687 "Only one type should be a floating point type"); 2688 Type *IntTy = 2689 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2690 VectorType *VecIntTy = VectorType::get(IntTy, VF); 2691 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2692 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2693 } 2694 2695 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2696 BasicBlock *Bypass) { 2697 Value *Count = getOrCreateTripCount(L); 2698 // Reuse existing vector loop preheader for TC checks. 2699 // Note that new preheader block is generated for vector loop. 2700 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2701 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2702 2703 // Generate code to check if the loop's trip count is less than VF * UF, or 2704 // equal to it in case a scalar epilogue is required; this implies that the 2705 // vector trip count is zero. This check also covers the case where adding one 2706 // to the backedge-taken count overflowed leading to an incorrect trip count 2707 // of zero. In this case we will also jump to the scalar loop. 2708 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 2709 : ICmpInst::ICMP_ULT; 2710 2711 // If tail is to be folded, vector loop takes care of all iterations. 2712 Value *CheckMinIters = Builder.getFalse(); 2713 if (!Cost->foldTailByMasking()) 2714 CheckMinIters = Builder.CreateICmp( 2715 P, Count, ConstantInt::get(Count->getType(), VF * UF), 2716 "min.iters.check"); 2717 2718 // Create new preheader for vector loop. 2719 LoopVectorPreHeader = 2720 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2721 "vector.ph"); 2722 2723 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2724 DT->getNode(Bypass)->getIDom()) && 2725 "TC check is expected to dominate Bypass"); 2726 2727 // Update dominator for Bypass & LoopExit. 
2728 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2729 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2730 2731 ReplaceInstWithInst( 2732 TCCheckBlock->getTerminator(), 2733 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2734 LoopBypassBlocks.push_back(TCCheckBlock); 2735 } 2736 2737 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2738 // Reuse existing vector loop preheader for SCEV checks. 2739 // Note that new preheader block is generated for vector loop. 2740 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 2741 2742 // Generate the code to check that the SCEV assumptions that we made. 2743 // We want the new basic block to start at the first instruction in a 2744 // sequence of instructions that form a check. 2745 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2746 "scev.check"); 2747 Value *SCEVCheck = Exp.expandCodeForPredicate( 2748 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 2749 2750 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2751 if (C->isZero()) 2752 return; 2753 2754 assert(!SCEVCheckBlock->getParent()->hasOptSize() && 2755 "Cannot SCEV check stride or overflow when optimizing for size"); 2756 2757 SCEVCheckBlock->setName("vector.scevcheck"); 2758 // Create new preheader for vector loop. 2759 LoopVectorPreHeader = 2760 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 2761 nullptr, "vector.ph"); 2762 2763 // Update dominator only if this is first RT check. 2764 if (LoopBypassBlocks.empty()) { 2765 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2766 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2767 } 2768 2769 ReplaceInstWithInst( 2770 SCEVCheckBlock->getTerminator(), 2771 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 2772 LoopBypassBlocks.push_back(SCEVCheckBlock); 2773 AddedSafetyChecks = true; 2774 } 2775 2776 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2777 // VPlan-native path does not do any analysis for runtime checks currently. 2778 if (EnableVPlanNativePath) 2779 return; 2780 2781 // Reuse existing vector loop preheader for runtime memory checks. 2782 // Note that new preheader block is generated for vector loop. 2783 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 2784 2785 // Generate the code that checks in runtime if arrays overlap. We put the 2786 // checks into a separate block to make the more common case of few elements 2787 // faster. 
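  // Conceptually (a rough sketch of what the LoopAccessInfo checks amount to,
  // not literal IR), each pair of potentially conflicting pointer groups A and
  // B is tested with something like:
  //   conflict = (A.start < B.end) & (B.start < A.end)
  // The disjunction of all such tests forms MemRuntimeCheck below; if it is
  // true we branch to Bypass and run the original scalar loop instead.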
2788 auto *LAI = Legal->getLAI(); 2789 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 2790 if (!RtPtrChecking.Need) 2791 return; 2792 Instruction *FirstCheckInst; 2793 Instruction *MemRuntimeCheck; 2794 std::tie(FirstCheckInst, MemRuntimeCheck) = 2795 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 2796 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 2797 if (!MemRuntimeCheck) 2798 return; 2799 2800 if (MemCheckBlock->getParent()->hasOptSize()) { 2801 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2802 "Cannot emit memory checks when optimizing for size, unless forced " 2803 "to vectorize."); 2804 ORE->emit([&]() { 2805 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2806 L->getStartLoc(), L->getHeader()) 2807 << "Code-size may be reduced by not forcing " 2808 "vectorization, or by source-code modifications " 2809 "eliminating the need for runtime checks " 2810 "(e.g., adding 'restrict')."; 2811 }); 2812 } 2813 2814 MemCheckBlock->setName("vector.memcheck"); 2815 // Create new preheader for vector loop. 2816 LoopVectorPreHeader = 2817 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2818 "vector.ph"); 2819 2820 // Update dominator only if this is first RT check. 2821 if (LoopBypassBlocks.empty()) { 2822 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2823 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2824 } 2825 2826 ReplaceInstWithInst( 2827 MemCheckBlock->getTerminator(), 2828 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); 2829 LoopBypassBlocks.push_back(MemCheckBlock); 2830 AddedSafetyChecks = true; 2831 2832 // We currently don't use LoopVersioning for the actual loop cloning but we 2833 // still use it to add the noalias metadata. 2834 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2835 PSE.getSE()); 2836 LVer->prepareNoAliasMetadata(); 2837 } 2838 2839 Value *InnerLoopVectorizer::emitTransformedIndex( 2840 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2841 const InductionDescriptor &ID) const { 2842 2843 SCEVExpander Exp(*SE, DL, "induction"); 2844 auto Step = ID.getStep(); 2845 auto StartValue = ID.getStartValue(); 2846 assert(Index->getType() == Step->getType() && 2847 "Index type does not match StepValue type"); 2848 2849 // Note: the IR at this point is broken. We cannot use SE to create any new 2850 // SCEV and then expand it, hoping that SCEV's simplification will give us 2851 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2852 // lead to various SCEV crashes. So all we can do is to use builder and rely 2853 // on InstCombine for future simplifications. Here we handle some trivial 2854 // cases only. 
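  // As a rough summary of the switch further below (a sketch that ignores the
  // constant-folding special cases handled by the helpers):
  //   IK_IntInduction: StartValue + Index * Step
  //   IK_PtrInduction: getelementptr StartValue, Index * Step
  //   IK_FpInduction:  StartValue fadd/fsub Index * Step  (fast-math)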
2855 auto CreateAdd = [&B](Value *X, Value *Y) { 2856 assert(X->getType() == Y->getType() && "Types don't match!"); 2857 if (auto *CX = dyn_cast<ConstantInt>(X)) 2858 if (CX->isZero()) 2859 return Y; 2860 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2861 if (CY->isZero()) 2862 return X; 2863 return B.CreateAdd(X, Y); 2864 }; 2865 2866 auto CreateMul = [&B](Value *X, Value *Y) { 2867 assert(X->getType() == Y->getType() && "Types don't match!"); 2868 if (auto *CX = dyn_cast<ConstantInt>(X)) 2869 if (CX->isOne()) 2870 return Y; 2871 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2872 if (CY->isOne()) 2873 return X; 2874 return B.CreateMul(X, Y); 2875 }; 2876 2877 switch (ID.getKind()) { 2878 case InductionDescriptor::IK_IntInduction: { 2879 assert(Index->getType() == StartValue->getType() && 2880 "Index type does not match StartValue type"); 2881 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 2882 return B.CreateSub(StartValue, Index); 2883 auto *Offset = CreateMul( 2884 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); 2885 return CreateAdd(StartValue, Offset); 2886 } 2887 case InductionDescriptor::IK_PtrInduction: { 2888 assert(isa<SCEVConstant>(Step) && 2889 "Expected constant step for pointer induction"); 2890 return B.CreateGEP( 2891 StartValue->getType()->getPointerElementType(), StartValue, 2892 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), 2893 &*B.GetInsertPoint()))); 2894 } 2895 case InductionDescriptor::IK_FpInduction: { 2896 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2897 auto InductionBinOp = ID.getInductionBinOp(); 2898 assert(InductionBinOp && 2899 (InductionBinOp->getOpcode() == Instruction::FAdd || 2900 InductionBinOp->getOpcode() == Instruction::FSub) && 2901 "Original bin op should be defined for FP induction"); 2902 2903 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 2904 2905 // Floating point operations had to be 'fast' to enable the induction. 2906 FastMathFlags Flags; 2907 Flags.setFast(); 2908 2909 Value *MulExp = B.CreateFMul(StepValue, Index); 2910 if (isa<Instruction>(MulExp)) 2911 // We have to check, the MulExp may be a constant. 2912 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 2913 2914 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2915 "induction"); 2916 if (isa<Instruction>(BOp)) 2917 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2918 2919 return BOp; 2920 } 2921 case InductionDescriptor::IK_NoInduction: 2922 return nullptr; 2923 } 2924 llvm_unreachable("invalid enum"); 2925 } 2926 2927 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 2928 /* 2929 In this function we generate a new loop. The new loop will contain 2930 the vectorized instructions while the old loop will continue to run the 2931 scalar remainder. 2932 2933 [ ] <-- loop iteration number check. 2934 / | 2935 / v 2936 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2937 | / | 2938 | / v 2939 || [ ] <-- vector pre header. 2940 |/ | 2941 | v 2942 | [ ] \ 2943 | [ ]_| <-- vector loop. 2944 | | 2945 | v 2946 | -[ ] <--- middle-block. 2947 | / | 2948 | / v 2949 -|- >[ ] <--- new preheader. 2950 | | 2951 | v 2952 | [ ] \ 2953 | [ ]_| <-- old scalar loop to handle remainder. 2954 \ | 2955 \ v 2956 >[ ] <-- exit block. 2957 ... 2958 */ 2959 2960 MDNode *OrigLoopID = OrigLoop->getLoopID(); 2961 2962 // Some loops have a single integer induction variable, while other loops 2963 // don't. 
One example is c++ iterators that often have multiple pointer 2964 // induction variables. In the code below we also support a case where we 2965 // don't have a single induction variable. 2966 // 2967 // We try to obtain an induction variable from the original loop as hard 2968 // as possible. However if we don't find one that: 2969 // - is an integer 2970 // - counts from zero, stepping by one 2971 // - is the size of the widest induction variable type 2972 // then we create a new one. 2973 OldInduction = Legal->getPrimaryInduction(); 2974 Type *IdxTy = Legal->getWidestInductionType(); 2975 2976 // Split the single block loop into the two loop structure described above. 2977 LoopScalarBody = OrigLoop->getHeader(); 2978 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 2979 LoopExitBlock = OrigLoop->getExitBlock(); 2980 assert(LoopExitBlock && "Must have an exit block"); 2981 assert(LoopVectorPreHeader && "Invalid loop structure"); 2982 2983 LoopMiddleBlock = 2984 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2985 LI, nullptr, "middle.block"); 2986 LoopScalarPreHeader = 2987 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 2988 nullptr, "scalar.ph"); 2989 // We intentionally don't let SplitBlock to update LoopInfo since 2990 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 2991 // LoopVectorBody is explicitly added to the correct place few lines later. 2992 LoopVectorBody = 2993 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2994 nullptr, nullptr, "vector.body"); 2995 2996 // Update dominator for loop exit. 2997 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 2998 2999 // Create and register the new vector loop. 3000 Loop *Lp = LI->AllocateLoop(); 3001 Loop *ParentLoop = OrigLoop->getParentLoop(); 3002 3003 // Insert the new loop into the loop nest and register the new basic blocks 3004 // before calling any utilities such as SCEV that require valid LoopInfo. 3005 if (ParentLoop) { 3006 ParentLoop->addChildLoop(Lp); 3007 } else { 3008 LI->addTopLevelLoop(Lp); 3009 } 3010 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3011 3012 // Find the loop boundaries. 3013 Value *Count = getOrCreateTripCount(Lp); 3014 3015 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3016 3017 // Now, compare the new count to zero. If it is zero skip the vector loop and 3018 // jump to the scalar loop. This check also covers the case where the 3019 // backedge-taken count is uint##_max: adding one to it will overflow leading 3020 // to an incorrect trip count of zero. In this (rare) case we will also jump 3021 // to the scalar loop. 3022 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3023 3024 // Generate the code to check any assumptions that we've made for SCEV 3025 // expressions. 3026 emitSCEVChecks(Lp, LoopScalarPreHeader); 3027 3028 // Generate the code that checks in runtime if arrays overlap. We put the 3029 // checks into a separate block to make the more common case of few elements 3030 // faster. 3031 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3032 3033 // Generate the induction variable. 3034 // The loop step is equal to the vectorization factor (num of SIMD elements) 3035 // times the unroll factor (num of SIMD instructions). 
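  // For example (illustrative values only), with VF = 4 and UF = 2 the
  // canonical induction variable starts at 0 and is bumped by 8 on every
  // iteration of the vector loop, exiting once it reaches the vector trip
  // count computed below.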
3036 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3037 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 3038 Induction = 3039 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3040 getDebugLocFromInstOrOperands(OldInduction)); 3041 3042 // We are going to resume the execution of the scalar loop. 3043 // Go over all of the induction variables that we found and fix the 3044 // PHIs that are left in the scalar version of the loop. 3045 // The starting values of PHI nodes depend on the counter of the last 3046 // iteration in the vectorized loop. 3047 // If we come from a bypass edge then we need to start from the original 3048 // start value. 3049 3050 // This variable saves the new starting index for the scalar loop. It is used 3051 // to test if there are any tail iterations left once the vector loop has 3052 // completed. 3053 for (auto &InductionEntry : Legal->getInductionVars()) { 3054 PHINode *OrigPhi = InductionEntry.first; 3055 InductionDescriptor II = InductionEntry.second; 3056 3057 // Create phi nodes to merge from the backedge-taken check block. 3058 PHINode *BCResumeVal = 3059 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3060 LoopScalarPreHeader->getTerminator()); 3061 // Copy original phi DL over to the new one. 3062 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3063 Value *&EndValue = IVEndValues[OrigPhi]; 3064 if (OrigPhi == OldInduction) { 3065 // We know what the end value is. 3066 EndValue = CountRoundDown; 3067 } else { 3068 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); 3069 Type *StepType = II.getStep()->getType(); 3070 Instruction::CastOps CastOp = 3071 CastInst::getCastOpcode(CountRoundDown, true, StepType, true); 3072 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); 3073 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3074 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3075 EndValue->setName("ind.end"); 3076 } 3077 3078 // The new PHI merges the original incoming value, in case of a bypass, 3079 // or the value at the end of the vectorized loop. 3080 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3081 3082 // Fix the scalar body counter (PHI node). 3083 // The old induction's phi node in the scalar body needs the truncated 3084 // value. 3085 for (BasicBlock *BB : LoopBypassBlocks) 3086 BCResumeVal->addIncoming(II.getStartValue(), BB); 3087 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3088 } 3089 3090 // We need the OrigLoop (scalar loop part) latch terminator to help 3091 // produce correct debug info for the middle block BB instructions. 3092 // The legality check stage guarantees that the loop will have a single 3093 // latch. 3094 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3095 "Scalar loop latch terminator isn't a branch"); 3096 BranchInst *ScalarLatchBr = 3097 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3098 3099 // Add a check in the middle block to see if we have completed 3100 // all of the iterations in the first vector loop. 3101 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3102 // If tail is to be folded, we know we don't need to run the remainder. 
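  // For example (illustrative numbers only): with N = 10 and VF * UF = 4 we
  // get CountRoundDown = 8 != 10, so cmp.n is false and the scalar loop runs
  // the 2 remaining iterations; with N = 12 the counts match and we branch
  // straight to the exit block.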
3103 Value *CmpN = Builder.getTrue(); 3104 if (!Cost->foldTailByMasking()) { 3105 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3106 CountRoundDown, "cmp.n", 3107 LoopMiddleBlock->getTerminator()); 3108 3109 // Here we use the same DebugLoc as the scalar loop latch branch instead 3110 // of the corresponding compare because they may have ended up with 3111 // different line numbers and we want to avoid awkward line stepping while 3112 // debugging. Eg. if the compare has got a line number inside the loop. 3113 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3114 } 3115 3116 BranchInst *BrInst = 3117 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3118 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3119 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3120 3121 // Get ready to start creating new instructions into the vectorized body. 3122 assert(LoopVectorPreHeader == Lp->getLoopPreheader() && 3123 "Inconsistent vector loop preheader"); 3124 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3125 3126 Optional<MDNode *> VectorizedLoopID = 3127 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3128 LLVMLoopVectorizeFollowupVectorized}); 3129 if (VectorizedLoopID.hasValue()) { 3130 Lp->setLoopID(VectorizedLoopID.getValue()); 3131 3132 // Do not setAlreadyVectorized if loop attributes have been defined 3133 // explicitly. 3134 return LoopVectorPreHeader; 3135 } 3136 3137 // Keep all loop hints from the original loop on the vector loop (we'll 3138 // replace the vectorizer-specific hints below). 3139 if (MDNode *LID = OrigLoop->getLoopID()) 3140 Lp->setLoopID(LID); 3141 3142 LoopVectorizeHints Hints(Lp, true, *ORE); 3143 Hints.setAlreadyVectorized(); 3144 3145 #ifdef EXPENSIVE_CHECKS 3146 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3147 LI->verify(*DT); 3148 #endif 3149 3150 return LoopVectorPreHeader; 3151 } 3152 3153 // Fix up external users of the induction variable. At this point, we are 3154 // in LCSSA form, with all external PHIs that use the IV having one input value, 3155 // coming from the remainder loop. We need those PHIs to also have a correct 3156 // value for the IV when arriving directly from the middle block. 3157 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3158 const InductionDescriptor &II, 3159 Value *CountRoundDown, Value *EndValue, 3160 BasicBlock *MiddleBlock) { 3161 // There are two kinds of external IV usages - those that use the value 3162 // computed in the last iteration (the PHI) and those that use the penultimate 3163 // value (the value that feeds into the phi from the loop latch). 3164 // We allow both, but they, obviously, have different values. 3165 3166 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3167 3168 DenseMap<Value *, Value *> MissingVals; 3169 3170 // An external user of the last iteration's value should see the value that 3171 // the remainder loop uses to initialize its own IV. 3172 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3173 for (User *U : PostInc->users()) { 3174 Instruction *UI = cast<Instruction>(U); 3175 if (!OrigLoop->contains(UI)) { 3176 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3177 MissingVals[UI] = EndValue; 3178 } 3179 } 3180 3181 // An external user of the penultimate value need to see EndValue - Step. 3182 // The simplest way to get this is to recompute it from the constituent SCEVs, 3183 // that is Start + (Step * (CRD - 1)). 
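  // For instance (a hypothetical IV, not taken from the source), with
  // Start = 0, Step = 1 and CountRoundDown = 8, users of the latch value see
  // EndValue = 8, whereas users of the phi itself must see
  // 0 + 1 * (8 - 1) = 7.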
3184 for (User *U : OrigPhi->users()) { 3185 auto *UI = cast<Instruction>(U); 3186 if (!OrigLoop->contains(UI)) { 3187 const DataLayout &DL = 3188 OrigLoop->getHeader()->getModule()->getDataLayout(); 3189 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3190 3191 IRBuilder<> B(MiddleBlock->getTerminator()); 3192 Value *CountMinusOne = B.CreateSub( 3193 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3194 Value *CMO = 3195 !II.getStep()->getType()->isIntegerTy() 3196 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3197 II.getStep()->getType()) 3198 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3199 CMO->setName("cast.cmo"); 3200 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3201 Escape->setName("ind.escape"); 3202 MissingVals[UI] = Escape; 3203 } 3204 } 3205 3206 for (auto &I : MissingVals) { 3207 PHINode *PHI = cast<PHINode>(I.first); 3208 // One corner case we have to handle is two IVs "chasing" each-other, 3209 // that is %IV2 = phi [...], [ %IV1, %latch ] 3210 // In this case, if IV1 has an external use, we need to avoid adding both 3211 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3212 // don't already have an incoming value for the middle block. 3213 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3214 PHI->addIncoming(I.second, MiddleBlock); 3215 } 3216 } 3217 3218 namespace { 3219 3220 struct CSEDenseMapInfo { 3221 static bool canHandle(const Instruction *I) { 3222 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3223 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3224 } 3225 3226 static inline Instruction *getEmptyKey() { 3227 return DenseMapInfo<Instruction *>::getEmptyKey(); 3228 } 3229 3230 static inline Instruction *getTombstoneKey() { 3231 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3232 } 3233 3234 static unsigned getHashValue(const Instruction *I) { 3235 assert(canHandle(I) && "Unknown instruction!"); 3236 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3237 I->value_op_end())); 3238 } 3239 3240 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3241 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3242 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3243 return LHS == RHS; 3244 return LHS->isIdenticalTo(RHS); 3245 } 3246 }; 3247 3248 } // end anonymous namespace 3249 3250 ///Perform cse of induction variable instructions. 3251 static void cse(BasicBlock *BB) { 3252 // Perform simple cse. 3253 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3254 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3255 Instruction *In = &*I++; 3256 3257 if (!CSEDenseMapInfo::canHandle(In)) 3258 continue; 3259 3260 // Check if we can replace this instruction with any of the 3261 // visited instructions. 3262 if (Instruction *V = CSEMap.lookup(In)) { 3263 In->replaceAllUsesWith(V); 3264 In->eraseFromParent(); 3265 continue; 3266 } 3267 3268 CSEMap[In] = In; 3269 } 3270 } 3271 3272 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3273 unsigned VF, 3274 bool &NeedToScalarize) { 3275 Function *F = CI->getCalledFunction(); 3276 Type *ScalarRetTy = CI->getType(); 3277 SmallVector<Type *, 4> Tys, ScalarTys; 3278 for (auto &ArgOp : CI->arg_operands()) 3279 ScalarTys.push_back(ArgOp->getType()); 3280 3281 // Estimate cost of scalarized vector call. 
The source operands are assumed 3282 // to be vectors, so we need to extract individual elements from there, 3283 // execute VF scalar calls, and then gather the result into the vector return 3284 // value. 3285 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, 3286 TTI::TCK_RecipThroughput); 3287 if (VF == 1) 3288 return ScalarCallCost; 3289 3290 // Compute corresponding vector type for return value and arguments. 3291 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3292 for (Type *ScalarTy : ScalarTys) 3293 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3294 3295 // Compute costs of unpacking argument values for the scalar calls and 3296 // packing the return values to a vector. 3297 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3298 3299 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3300 3301 // If we can't emit a vector call for this function, then the currently found 3302 // cost is the cost we need to return. 3303 NeedToScalarize = true; 3304 VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/); 3305 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3306 3307 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3308 return Cost; 3309 3310 // If the corresponding vector cost is cheaper, return its cost. 3311 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, 3312 TTI::TCK_RecipThroughput); 3313 if (VectorCallCost < Cost) { 3314 NeedToScalarize = false; 3315 return VectorCallCost; 3316 } 3317 return Cost; 3318 } 3319 3320 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3321 unsigned VF) { 3322 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3323 assert(ID && "Expected intrinsic call!"); 3324 3325 FastMathFlags FMF; 3326 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3327 FMF = FPMO->getFastMathFlags(); 3328 3329 SmallVector<Value *, 4> Operands(CI->arg_operands()); 3330 return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF, 3331 TargetTransformInfo::TCK_RecipThroughput, 3332 CI); 3333 } 3334 3335 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3336 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3337 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3338 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3339 } 3340 3341 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3342 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3343 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3344 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3345 } 3346 3347 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3348 // For every instruction `I` in MinBWs, truncate the operands, create a 3349 // truncated version of `I` and reextend its result. InstCombine runs 3350 // later and will remove any ext/trunc pairs. 3351 SmallPtrSet<Value *, 4> Erased; 3352 for (const auto &KV : Cost->getMinimalBitwidths()) { 3353 // If the value wasn't vectorized, we must maintain the original scalar 3354 // type. The absence of the value from VectorLoopValueMap indicates that it 3355 // wasn't vectorized. 
3356 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3357 continue; 3358 for (unsigned Part = 0; Part < UF; ++Part) { 3359 Value *I = getOrCreateVectorValue(KV.first, Part); 3360 if (Erased.find(I) != Erased.end() || I->use_empty() || 3361 !isa<Instruction>(I)) 3362 continue; 3363 Type *OriginalTy = I->getType(); 3364 Type *ScalarTruncatedTy = 3365 IntegerType::get(OriginalTy->getContext(), KV.second); 3366 Type *TruncatedTy = VectorType::get( 3367 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements()); 3368 if (TruncatedTy == OriginalTy) 3369 continue; 3370 3371 IRBuilder<> B(cast<Instruction>(I)); 3372 auto ShrinkOperand = [&](Value *V) -> Value * { 3373 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3374 if (ZI->getSrcTy() == TruncatedTy) 3375 return ZI->getOperand(0); 3376 return B.CreateZExtOrTrunc(V, TruncatedTy); 3377 }; 3378 3379 // The actual instruction modification depends on the instruction type, 3380 // unfortunately. 3381 Value *NewI = nullptr; 3382 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3383 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3384 ShrinkOperand(BO->getOperand(1))); 3385 3386 // Any wrapping introduced by shrinking this operation shouldn't be 3387 // considered undefined behavior. So, we can't unconditionally copy 3388 // arithmetic wrapping flags to NewI. 3389 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3390 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3391 NewI = 3392 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3393 ShrinkOperand(CI->getOperand(1))); 3394 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3395 NewI = B.CreateSelect(SI->getCondition(), 3396 ShrinkOperand(SI->getTrueValue()), 3397 ShrinkOperand(SI->getFalseValue())); 3398 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3399 switch (CI->getOpcode()) { 3400 default: 3401 llvm_unreachable("Unhandled cast!"); 3402 case Instruction::Trunc: 3403 NewI = ShrinkOperand(CI->getOperand(0)); 3404 break; 3405 case Instruction::SExt: 3406 NewI = B.CreateSExtOrTrunc( 3407 CI->getOperand(0), 3408 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3409 break; 3410 case Instruction::ZExt: 3411 NewI = B.CreateZExtOrTrunc( 3412 CI->getOperand(0), 3413 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3414 break; 3415 } 3416 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3417 auto Elements0 = 3418 cast<VectorType>(SI->getOperand(0)->getType())->getNumElements(); 3419 auto *O0 = B.CreateZExtOrTrunc( 3420 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3421 auto Elements1 = 3422 cast<VectorType>(SI->getOperand(1)->getType())->getNumElements(); 3423 auto *O1 = B.CreateZExtOrTrunc( 3424 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3425 3426 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3427 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3428 // Don't do anything with the operands, just extend the result. 
3429 continue; 3430 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3431 auto Elements = 3432 cast<VectorType>(IE->getOperand(0)->getType())->getNumElements(); 3433 auto *O0 = B.CreateZExtOrTrunc( 3434 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3435 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3436 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3437 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3438 auto Elements = 3439 cast<VectorType>(EE->getOperand(0)->getType())->getNumElements(); 3440 auto *O0 = B.CreateZExtOrTrunc( 3441 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3442 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3443 } else { 3444 // If we don't know what to do, be conservative and don't do anything. 3445 continue; 3446 } 3447 3448 // Lastly, extend the result. 3449 NewI->takeName(cast<Instruction>(I)); 3450 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3451 I->replaceAllUsesWith(Res); 3452 cast<Instruction>(I)->eraseFromParent(); 3453 Erased.insert(I); 3454 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3455 } 3456 } 3457 3458 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3459 for (const auto &KV : Cost->getMinimalBitwidths()) { 3460 // If the value wasn't vectorized, we must maintain the original scalar 3461 // type. The absence of the value from VectorLoopValueMap indicates that it 3462 // wasn't vectorized. 3463 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3464 continue; 3465 for (unsigned Part = 0; Part < UF; ++Part) { 3466 Value *I = getOrCreateVectorValue(KV.first, Part); 3467 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3468 if (Inst && Inst->use_empty()) { 3469 Value *NewI = Inst->getOperand(0); 3470 Inst->eraseFromParent(); 3471 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3472 } 3473 } 3474 } 3475 } 3476 3477 void InnerLoopVectorizer::fixVectorizedLoop() { 3478 // Insert truncates and extends for any truncated instructions as hints to 3479 // InstCombine. 3480 if (VF > 1) 3481 truncateToMinimalBitwidths(); 3482 3483 // Fix widened non-induction PHIs by setting up the PHI operands. 3484 if (OrigPHIsToFix.size()) { 3485 assert(EnableVPlanNativePath && 3486 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3487 fixNonInductionPHIs(); 3488 } 3489 3490 // At this point every instruction in the original loop is widened to a 3491 // vector form. Now we need to fix the recurrences in the loop. These PHI 3492 // nodes are currently empty because we did not want to introduce cycles. 3493 // This is the second stage of vectorizing recurrences. 3494 fixCrossIterationPHIs(); 3495 3496 // Forget the original basic block. 3497 PSE.getSE()->forgetLoop(OrigLoop); 3498 3499 // Fix-up external users of the induction variables. 3500 for (auto &Entry : Legal->getInductionVars()) 3501 fixupIVUsers(Entry.first, Entry.second, 3502 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3503 IVEndValues[Entry.first], LoopMiddleBlock); 3504 3505 fixLCSSAPHIs(); 3506 for (Instruction *PI : PredicatedInstructions) 3507 sinkScalarOperands(&*PI); 3508 3509 // Remove redundant induction instructions. 3510 cse(LoopVectorBody); 3511 3512 // Set/update profile weights for the vector and remainder loops as original 3513 // loop iterations are now distributed among them. Note that original loop 3514 // represented by LoopScalarBody becomes remainder loop after vectorization. 
3515 //
3516 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3517 // end up getting a slightly less accurate result, but that should be OK since
3518 // the profile is not inherently precise anyway. Note also that a possible
3519 // bypass of vector code caused by legality checks is ignored, assigning all
3520 // the weight to the vector loop, optimistically.
3521 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3522 LI->getLoopFor(LoopVectorBody),
3523 LI->getLoopFor(LoopScalarBody), VF * UF);
3524 }
3525
3526 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3527 // In order to support recurrences we need to be able to vectorize Phi nodes.
3528 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3529 // stage #2: We now need to fix the recurrences by adding incoming edges to
3530 // the currently empty PHI nodes. At this point every instruction in the
3531 // original loop is widened to a vector form so we can use them to construct
3532 // the incoming edges.
3533 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3534 // Handle first-order recurrences and reductions that need to be fixed.
3535 if (Legal->isFirstOrderRecurrence(&Phi))
3536 fixFirstOrderRecurrence(&Phi);
3537 else if (Legal->isReductionVariable(&Phi))
3538 fixReduction(&Phi);
3539 }
3540 }
3541
3542 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3543 // This is the second phase of vectorizing first-order recurrences. An
3544 // overview of the transformation is described below. Suppose we have the
3545 // following loop.
3546 //
3547 // for (int i = 0; i < n; ++i)
3548 // b[i] = a[i] - a[i - 1];
3549 //
3550 // There is a first-order recurrence on "a". For this loop, the shorthand
3551 // scalar IR looks like:
3552 //
3553 // scalar.ph:
3554 // s_init = a[-1]
3555 // br scalar.body
3556 //
3557 // scalar.body:
3558 // i = phi [0, scalar.ph], [i+1, scalar.body]
3559 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3560 // s2 = a[i]
3561 // b[i] = s2 - s1
3562 // br cond, scalar.body, ...
3563 //
3564 // In this example, s1 is a recurrence because its value depends on the
3565 // previous iteration. In the first phase of vectorization, we created a
3566 // temporary value for s1. We now complete the vectorization and produce the
3567 // shorthand vector IR shown below (for VF = 4, UF = 1).
3568 //
3569 // vector.ph:
3570 // v_init = vector(..., ..., ..., a[-1])
3571 // br vector.body
3572 //
3573 // vector.body
3574 // i = phi [0, vector.ph], [i+4, vector.body]
3575 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3576 // v2 = a[i, i+1, i+2, i+3];
3577 // v3 = vector(v1(3), v2(0, 1, 2))
3578 // b[i, i+1, i+2, i+3] = v2 - v3
3579 // br cond, vector.body, middle.block
3580 //
3581 // middle.block:
3582 // x = v2(3)
3583 // br scalar.ph
3584 //
3585 // scalar.ph:
3586 // s_init = phi [x, middle.block], [a[-1], otherwise]
3587 // br scalar.body
3588 //
3589 // After the vector loop completes execution, we extract the next value of
3590 // the recurrence (x) to use as the initial value in the scalar loop.
3591
3592 // Get the original loop preheader and single loop latch.
3593 auto *Preheader = OrigLoop->getLoopPreheader();
3594 auto *Latch = OrigLoop->getLoopLatch();
3595
3596 // Get the initial and previous values of the scalar recurrence.
3597 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3598 auto *Previous = Phi->getIncomingValueForBlock(Latch);
3599
3600 // Create a vector from the initial value.
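  // For VF = 4 this builds, roughly, <undef, undef, undef, s_init> by
  // inserting the scalar initial value into lane VF - 1 of an undef vector;
  // the shuffle mask used further below then reads that last lane first.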
3601 auto *VectorInit = ScalarInit; 3602 if (VF > 1) { 3603 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3604 VectorInit = Builder.CreateInsertElement( 3605 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3606 Builder.getInt32(VF - 1), "vector.recur.init"); 3607 } 3608 3609 // We constructed a temporary phi node in the first phase of vectorization. 3610 // This phi node will eventually be deleted. 3611 Builder.SetInsertPoint( 3612 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3613 3614 // Create a phi node for the new recurrence. The current value will either be 3615 // the initial value inserted into a vector or loop-varying vector value. 3616 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3617 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3618 3619 // Get the vectorized previous value of the last part UF - 1. It appears last 3620 // among all unrolled iterations, due to the order of their construction. 3621 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3622 3623 // Find and set the insertion point after the previous value if it is an 3624 // instruction. 3625 BasicBlock::iterator InsertPt; 3626 // Note that the previous value may have been constant-folded so it is not 3627 // guaranteed to be an instruction in the vector loop. 3628 // FIXME: Loop invariant values do not form recurrences. We should deal with 3629 // them earlier. 3630 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3631 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3632 else { 3633 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3634 if (isa<PHINode>(PreviousLastPart)) 3635 // If the previous value is a phi node, we should insert after all the phi 3636 // nodes in the block containing the PHI to avoid breaking basic block 3637 // verification. Note that the basic block may be different to 3638 // LoopVectorBody, in case we predicate the loop. 3639 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3640 else 3641 InsertPt = ++PreviousInst->getIterator(); 3642 } 3643 Builder.SetInsertPoint(&*InsertPt); 3644 3645 // We will construct a vector for the recurrence by combining the values for 3646 // the current and previous iterations. This is the required shuffle mask. 3647 SmallVector<int, 8> ShuffleMask(VF); 3648 ShuffleMask[0] = VF - 1; 3649 for (unsigned I = 1; I < VF; ++I) 3650 ShuffleMask[I] = I + VF - 1; 3651 3652 // The vector from which to take the initial value for the current iteration 3653 // (actual or unrolled). Initially, this is the vector phi node. 3654 Value *Incoming = VecPhi; 3655 3656 // Shuffle the current and previous vector and update the vector parts. 3657 for (unsigned Part = 0; Part < UF; ++Part) { 3658 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3659 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3660 auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, 3661 ShuffleMask) 3662 : Incoming; 3663 PhiPart->replaceAllUsesWith(Shuffle); 3664 cast<Instruction>(PhiPart)->eraseFromParent(); 3665 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3666 Incoming = PreviousPart; 3667 } 3668 3669 // Fix the latch value of the new recurrence in the vector loop. 3670 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3671 3672 // Extract the last vector element in the middle block. 
This will be the 3673 // initial value for the recurrence when jumping to the scalar loop. 3674 auto *ExtractForScalar = Incoming; 3675 if (VF > 1) { 3676 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3677 ExtractForScalar = Builder.CreateExtractElement( 3678 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); 3679 } 3680 // Extract the second last element in the middle block if the 3681 // Phi is used outside the loop. We need to extract the phi itself 3682 // and not the last element (the phi update in the current iteration). This 3683 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3684 // when the scalar loop is not run at all. 3685 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3686 if (VF > 1) 3687 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3688 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); 3689 // When loop is unrolled without vectorizing, initialize 3690 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 3691 // `Incoming`. This is analogous to the vectorized case above: extracting the 3692 // second last element when VF > 1. 3693 else if (UF > 1) 3694 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 3695 3696 // Fix the initial value of the original recurrence in the scalar loop. 3697 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3698 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3699 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3700 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3701 Start->addIncoming(Incoming, BB); 3702 } 3703 3704 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3705 Phi->setName("scalar.recur"); 3706 3707 // Finally, fix users of the recurrence outside the loop. The users will need 3708 // either the last value of the scalar recurrence or the last value of the 3709 // vector recurrence we extracted in the middle block. Since the loop is in 3710 // LCSSA form, we just need to find all the phi nodes for the original scalar 3711 // recurrence in the exit block, and then add an edge for the middle block. 3712 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3713 if (LCSSAPhi.getIncomingValue(0) == Phi) { 3714 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 3715 } 3716 } 3717 } 3718 3719 void InnerLoopVectorizer::fixReduction(PHINode *Phi) { 3720 Constant *Zero = Builder.getInt32(0); 3721 3722 // Get it's reduction variable descriptor. 3723 assert(Legal->isReductionVariable(Phi) && 3724 "Unable to find the reduction variable"); 3725 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 3726 3727 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3728 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3729 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3730 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = 3731 RdxDesc.getMinMaxRecurrenceKind(); 3732 setDebugLocFromInst(Builder, ReductionStartValue); 3733 3734 // We need to generate a reduction vector from the incoming scalar. 3735 // To do so, we need to generate the 'identity' vector and override 3736 // one of the elements with the incoming scalar reduction. We need 3737 // to do it in the vector-loop preheader. 3738 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3739 3740 // This is the vector-clone of the value that leaves the loop. 
3741 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3742
3743 // Find the reduction identity variable. Zero for addition, or and xor;
3744 // one for multiplication; -1 for and.
3745 Value *Identity;
3746 Value *VectorStart;
3747 if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3748 RK == RecurrenceDescriptor::RK_FloatMinMax) {
3749 // MinMax reductions have the start value as their identity.
3750 if (VF == 1) {
3751 VectorStart = Identity = ReductionStartValue;
3752 } else {
3753 VectorStart = Identity =
3754 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3755 }
3756 } else {
3757 // Handle other reduction kinds:
3758 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3759 RK, VecTy->getScalarType());
3760 if (VF == 1) {
3761 Identity = Iden;
3762 // This vector is the Identity vector where the first element is the
3763 // incoming scalar reduction.
3764 VectorStart = ReductionStartValue;
3765 } else {
3766 Identity = ConstantVector::getSplat({VF, false}, Iden);
3767
3768 // This vector is the Identity vector where the first element is the
3769 // incoming scalar reduction.
3770 VectorStart =
3771 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3772 }
3773 }
3774
3775 // Wrap flags are in general invalid after vectorization, clear them.
3776 clearReductionWrapFlags(RdxDesc);
3777
3778 // Fix the vector-loop phi.
3779
3780 // Reductions do not have to start at zero. They can start with
3781 // any loop-invariant values.
3782 BasicBlock *Latch = OrigLoop->getLoopLatch();
3783 Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3784
3785 for (unsigned Part = 0; Part < UF; ++Part) {
3786 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3787 Value *Val = getOrCreateVectorValue(LoopVal, Part);
3788 // Make sure to add the reduction start value only to the
3789 // first unroll part.
3790 Value *StartVal = (Part == 0) ? VectorStart : Identity;
3791 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3792 cast<PHINode>(VecRdxPhi)
3793 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3794 }
3795
3796 // Before each round, move the insertion point right between
3797 // the PHIs and the values we are going to write.
3798 // This allows us to write both PHINodes and the extractelement
3799 // instructions.
3800 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3801
3802 setDebugLocFromInst(Builder, LoopExitInst);
3803
3804 // If the tail is folded by masking, the vector value to leave the loop should
3805 // be a Select choosing between the vectorized LoopExitInst and the vectorized
3806 // Phi, instead of the former.
3807 if (Cost->foldTailByMasking()) {
3808 for (unsigned Part = 0; Part < UF; ++Part) {
3809 Value *VecLoopExitInst =
3810 VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3811 Value *Sel = nullptr;
3812 for (User *U : VecLoopExitInst->users()) {
3813 if (isa<SelectInst>(U)) {
3814 assert(!Sel && "Reduction exit feeding two selects");
3815 Sel = U;
3816 } else
3817 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3818 }
3819 assert(Sel && "Reduction exit feeds no select");
3820 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3821 }
3822 }
3823
3824 // If the vector reduction can be performed in a smaller type, we truncate
3825 // then extend the loop exit value to enable InstCombine to evaluate the
3826 // entire expression in the smaller type.
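  // Sketch of the shape we aim for (assuming an add reduction over i32 values
  // that fit in i8, VF = 4): each unrolled part %rdx is rewritten as
  //   %t = trunc <4 x i32> %rdx to <4 x i8>
  //   %e = sext/zext <4 x i8> %t to <4 x i32>
  // and the users of %rdx are pointed at %e, so InstCombine can later shrink
  // the whole reduction chain to i8.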
3827 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { 3828 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3829 Builder.SetInsertPoint( 3830 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 3831 VectorParts RdxParts(UF); 3832 for (unsigned Part = 0; Part < UF; ++Part) { 3833 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3834 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3835 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3836 : Builder.CreateZExt(Trunc, VecTy); 3837 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 3838 UI != RdxParts[Part]->user_end();) 3839 if (*UI != Trunc) { 3840 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 3841 RdxParts[Part] = Extnd; 3842 } else { 3843 ++UI; 3844 } 3845 } 3846 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3847 for (unsigned Part = 0; Part < UF; ++Part) { 3848 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3849 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 3850 } 3851 } 3852 3853 // Reduce all of the unrolled parts into a single vector. 3854 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 3855 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3856 3857 // The middle block terminator has already been assigned a DebugLoc here (the 3858 // OrigLoop's single latch terminator). We want the whole middle block to 3859 // appear to execute on this line because: (a) it is all compiler generated, 3860 // (b) these instructions are always executed after evaluating the latch 3861 // conditional branch, and (c) other passes may add new predecessors which 3862 // terminate on this line. This is the easiest way to ensure we don't 3863 // accidentally cause an extra step back into the loop while debugging. 3864 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 3865 for (unsigned Part = 1; Part < UF; ++Part) { 3866 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3867 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3868 // Floating point operations had to be 'fast' to enable the reduction. 3869 ReducedPartRdx = addFastMathFlag( 3870 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 3871 ReducedPartRdx, "bin.rdx"), 3872 RdxDesc.getFastMathFlags()); 3873 else 3874 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 3875 RdxPart); 3876 } 3877 3878 if (VF > 1) { 3879 bool NoNaN = Legal->hasFunNoNaNAttr(); 3880 ReducedPartRdx = 3881 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 3882 // If the reduction can be performed in a smaller type, we need to extend 3883 // the reduction to the wider type before we branch to the original loop. 3884 if (Phi->getType() != RdxDesc.getRecurrenceType()) 3885 ReducedPartRdx = 3886 RdxDesc.isSigned() 3887 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 3888 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 3889 } 3890 3891 // Create a phi node that merges control-flow from the backedge-taken check 3892 // block and the middle block. 
3893 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 3894 LoopScalarPreHeader->getTerminator()); 3895 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3896 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3897 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3898 3899 // Now, we need to fix the users of the reduction variable 3900 // inside and outside of the scalar remainder loop. 3901 // We know that the loop is in LCSSA form. We need to update the 3902 // PHI nodes in the exit blocks. 3903 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3904 // All PHINodes need to have a single entry edge, or two if 3905 // we already fixed them. 3906 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 3907 3908 // We found a reduction value exit-PHI. Update it with the 3909 // incoming bypass edge. 3910 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 3911 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 3912 } // end of the LCSSA phi scan. 3913 3914 // Fix the scalar loop reduction variable with the incoming reduction sum 3915 // from the vector body and from the backedge value. 3916 int IncomingEdgeBlockIdx = 3917 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 3918 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 3919 // Pick the other block. 3920 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 3921 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3922 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3923 } 3924 3925 void InnerLoopVectorizer::clearReductionWrapFlags( 3926 RecurrenceDescriptor &RdxDesc) { 3927 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3928 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 3929 RK != RecurrenceDescriptor::RK_IntegerMult) 3930 return; 3931 3932 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 3933 assert(LoopExitInstr && "null loop exit instruction"); 3934 SmallVector<Instruction *, 8> Worklist; 3935 SmallPtrSet<Instruction *, 8> Visited; 3936 Worklist.push_back(LoopExitInstr); 3937 Visited.insert(LoopExitInstr); 3938 3939 while (!Worklist.empty()) { 3940 Instruction *Cur = Worklist.pop_back_val(); 3941 if (isa<OverflowingBinaryOperator>(Cur)) 3942 for (unsigned Part = 0; Part < UF; ++Part) { 3943 Value *V = getOrCreateVectorValue(Cur, Part); 3944 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 3945 } 3946 3947 for (User *U : Cur->users()) { 3948 Instruction *UI = cast<Instruction>(U); 3949 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 3950 Visited.insert(UI).second) 3951 Worklist.push_back(UI); 3952 } 3953 } 3954 } 3955 3956 void InnerLoopVectorizer::fixLCSSAPHIs() { 3957 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3958 if (LCSSAPhi.getNumIncomingValues() == 1) { 3959 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 3960 // Non-instruction incoming values will have only one value. 3961 unsigned LastLane = 0; 3962 if (isa<Instruction>(IncomingValue)) 3963 LastLane = Cost->isUniformAfterVectorization( 3964 cast<Instruction>(IncomingValue), VF) 3965 ? 0 3966 : VF - 1; 3967 // Can be a loop invariant incoming value or the last scalar value to be 3968 // extracted from the vectorized loop. 
3969 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3970 Value *lastIncomingValue =
3971 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3972 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3973 }
3974 }
3975 }
3976
3977 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3978 // The basic block and loop containing the predicated instruction.
3979 auto *PredBB = PredInst->getParent();
3980 auto *VectorLoop = LI->getLoopFor(PredBB);
3981
3982 // Initialize a worklist with the operands of the predicated instruction.
3983 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3984
3985 // Holds instructions that we need to analyze again. An instruction may be
3986 // reanalyzed if we don't yet know if we can sink it or not.
3987 SmallVector<Instruction *, 8> InstsToReanalyze;
3988
3989 // Returns true if a given use occurs in the predicated block. Phi nodes use
3990 // their operands in their corresponding predecessor blocks.
3991 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3992 auto *I = cast<Instruction>(U.getUser());
3993 BasicBlock *BB = I->getParent();
3994 if (auto *Phi = dyn_cast<PHINode>(I))
3995 BB = Phi->getIncomingBlock(
3996 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3997 return BB == PredBB;
3998 };
3999
4000 // Iteratively sink the scalarized operands of the predicated instruction
4001 // into the block we created for it. When an instruction is sunk, its
4002 // operands are then added to the worklist. The algorithm ends once a full
4003 // pass over the worklist fails to sink a single instruction.
4004 bool Changed;
4005 do {
4006 // Add the instructions that need to be reanalyzed to the worklist, and
4007 // reset the changed indicator.
4008 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4009 InstsToReanalyze.clear();
4010 Changed = false;
4011
4012 while (!Worklist.empty()) {
4013 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4014
4015 // We can't sink an instruction if it is a phi node, is already in the
4016 // predicated block, is not in the loop, or may have side effects.
4017 if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4018 !VectorLoop->contains(I) || I->mayHaveSideEffects())
4019 continue;
4020
4021 // It's legal to sink the instruction if all its uses occur in the
4022 // predicated block. Otherwise, there's nothing to do yet, and we may
4023 // need to reanalyze the instruction.
4024 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4025 InstsToReanalyze.push_back(I);
4026 continue;
4027 }
4028
4029 // Move the instruction to the beginning of the predicated block, and add
4030 // its operands to the worklist.
4031 I->moveBefore(&*PredBB->getFirstInsertionPt());
4032 Worklist.insert(I->op_begin(), I->op_end());
4033
4034 // The sinking may have enabled other instructions to be sunk, so we will
4035 // need to iterate.
4036 Changed = true; 4037 } 4038 } while (Changed); 4039 } 4040 4041 void InnerLoopVectorizer::fixNonInductionPHIs() { 4042 for (PHINode *OrigPhi : OrigPHIsToFix) { 4043 PHINode *NewPhi = 4044 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4045 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4046 4047 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4048 predecessors(OrigPhi->getParent())); 4049 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4050 predecessors(NewPhi->getParent())); 4051 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4052 "Scalar and Vector BB should have the same number of predecessors"); 4053 4054 // The insertion point in Builder may be invalidated by the time we get 4055 // here. Force the Builder insertion point to something valid so that we do 4056 // not run into issues during insertion point restore in 4057 // getOrCreateVectorValue calls below. 4058 Builder.SetInsertPoint(NewPhi); 4059 4060 // The predecessor order is preserved and we can rely on mapping between 4061 // scalar and vector block predecessors. 4062 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4063 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4064 4065 // When looking up the new scalar/vector values to fix up, use incoming 4066 // values from original phi. 4067 Value *ScIncV = 4068 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4069 4070 // Scalar incoming value may need a broadcast 4071 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4072 NewPhi->addIncoming(NewIncV, NewPredBB); 4073 } 4074 } 4075 } 4076 4077 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, 4078 unsigned VF, bool IsPtrLoopInvariant, 4079 SmallBitVector &IsIndexLoopInvariant) { 4080 // Construct a vector GEP by widening the operands of the scalar GEP as 4081 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4082 // results in a vector of pointers when at least one operand of the GEP 4083 // is vector-typed. Thus, to keep the representation compact, we only use 4084 // vector-typed operands for loop-varying values. 4085 4086 if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4087 // If we are vectorizing, but the GEP has only loop-invariant operands, 4088 // the GEP we build (by only using vector-typed operands for 4089 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4090 // produce a vector of pointers, we need to either arbitrarily pick an 4091 // operand to broadcast, or broadcast a clone of the original GEP. 4092 // Here, we broadcast a clone of the original. 4093 // 4094 // TODO: If at some point we decide to scalarize instructions having 4095 // loop-invariant operands, this special case will no longer be 4096 // required. We would add the scalarization decision to 4097 // collectLoopScalars() and teach getVectorValue() to broadcast 4098 // the lane-zero scalar value. 4099 auto *Clone = Builder.Insert(GEP->clone()); 4100 for (unsigned Part = 0; Part < UF; ++Part) { 4101 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4102 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4103 addMetadata(EntryPart, GEP); 4104 } 4105 } else { 4106 // If the GEP has at least one loop-varying operand, we are sure to 4107 // produce a vector of pointers. But if we are only unrolling, we want 4108 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4109 // produce with the code below will be scalar (if VF == 1) or vector 4110 // (otherwise). 
Note that for the unroll-only case, we still maintain 4111 // values in the vector mapping with initVector, as we do for other 4112 // instructions. 4113 for (unsigned Part = 0; Part < UF; ++Part) { 4114 // The pointer operand of the new GEP. If it's loop-invariant, we 4115 // won't broadcast it. 4116 auto *Ptr = IsPtrLoopInvariant 4117 ? GEP->getPointerOperand() 4118 : getOrCreateVectorValue(GEP->getPointerOperand(), Part); 4119 4120 // Collect all the indices for the new GEP. If any index is 4121 // loop-invariant, we won't broadcast it. 4122 SmallVector<Value *, 4> Indices; 4123 for (auto Index : enumerate(GEP->indices())) { 4124 Value *User = Index.value().get(); 4125 if (IsIndexLoopInvariant[Index.index()]) 4126 Indices.push_back(User); 4127 else 4128 Indices.push_back(getOrCreateVectorValue(User, Part)); 4129 } 4130 4131 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4132 // but it should be a vector, otherwise. 4133 auto *NewGEP = 4134 GEP->isInBounds() 4135 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4136 Indices) 4137 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4138 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4139 "NewGEP is not a pointer vector"); 4140 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4141 addMetadata(NewGEP, GEP); 4142 } 4143 } 4144 } 4145 4146 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4147 unsigned VF) { 4148 PHINode *P = cast<PHINode>(PN); 4149 if (EnableVPlanNativePath) { 4150 // Currently we enter here in the VPlan-native path for non-induction 4151 // PHIs where all control flow is uniform. We simply widen these PHIs. 4152 // Create a vector phi with no operands - the vector phi operands will be 4153 // set at the end of vector code generation. 4154 Type *VecTy = 4155 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4156 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4157 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4158 OrigPHIsToFix.push_back(P); 4159 4160 return; 4161 } 4162 4163 assert(PN->getParent() == OrigLoop->getHeader() && 4164 "Non-header phis should have been handled elsewhere"); 4165 4166 // In order to support recurrences we need to be able to vectorize Phi nodes. 4167 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4168 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4169 // this value when we vectorize all of the instructions that use the PHI. 4170 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4171 for (unsigned Part = 0; Part < UF; ++Part) { 4172 // This is phase one of vectorizing PHIs. 4173 Type *VecTy = 4174 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4175 Value *EntryPart = PHINode::Create( 4176 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4177 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4178 } 4179 return; 4180 } 4181 4182 setDebugLocFromInst(Builder, P); 4183 4184 // This PHINode must be an induction variable. 4185 // Make sure that we know about it. 4186 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4187 4188 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4189 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4190 4191 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4192 // which can be found from the original scalar operations. 
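  // Illustrative sketch (hypothetical VF/UF values, not taken from this file):
  // in the pointer-induction case handled below, lane L of unroll part P
  // receives the scalar index PtrInd + (L + P * VF); for VF = 4 and UF = 2
  // that yields indices 0..7, each of which emitTransformedIndex() turns into
  // a per-lane "next.gep" address from the induction's start value and step.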
4193 switch (II.getKind()) { 4194 case InductionDescriptor::IK_NoInduction: 4195 llvm_unreachable("Unknown induction"); 4196 case InductionDescriptor::IK_IntInduction: 4197 case InductionDescriptor::IK_FpInduction: 4198 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4199 case InductionDescriptor::IK_PtrInduction: { 4200 // Handle the pointer induction variable case. 4201 assert(P->getType()->isPointerTy() && "Unexpected type."); 4202 // This is the normalized GEP that starts counting at zero. 4203 Value *PtrInd = Induction; 4204 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); 4205 // Determine the number of scalars we need to generate for each unroll 4206 // iteration. If the instruction is uniform, we only need to generate the 4207 // first lane. Otherwise, we generate all VF values. 4208 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; 4209 // These are the scalar results. Notice that we don't generate vector GEPs 4210 // because scalar GEPs result in better code. 4211 for (unsigned Part = 0; Part < UF; ++Part) { 4212 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4213 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); 4214 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4215 Value *SclrGep = 4216 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4217 SclrGep->setName("next.gep"); 4218 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4219 } 4220 } 4221 return; 4222 } 4223 } 4224 } 4225 4226 /// A helper function for checking whether an integer division-related 4227 /// instruction may divide by zero (in which case it must be predicated if 4228 /// executed conditionally in the scalar code). 4229 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4230 /// Non-zero divisors that are non compile-time constants will not be 4231 /// converted into multiplication, so we will still end up scalarizing 4232 /// the division, but can do so w/o predication. 4233 static bool mayDivideByZero(Instruction &I) { 4234 assert((I.getOpcode() == Instruction::UDiv || 4235 I.getOpcode() == Instruction::SDiv || 4236 I.getOpcode() == Instruction::URem || 4237 I.getOpcode() == Instruction::SRem) && 4238 "Unexpected instruction"); 4239 Value *Divisor = I.getOperand(1); 4240 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4241 return !CInt || CInt->isZero(); 4242 } 4243 4244 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, 4245 VPTransformState &State) { 4246 switch (I.getOpcode()) { 4247 case Instruction::Call: 4248 case Instruction::Br: 4249 case Instruction::PHI: 4250 case Instruction::GetElementPtr: 4251 case Instruction::Select: 4252 llvm_unreachable("This instruction is handled by a different recipe."); 4253 case Instruction::UDiv: 4254 case Instruction::SDiv: 4255 case Instruction::SRem: 4256 case Instruction::URem: 4257 case Instruction::Add: 4258 case Instruction::FAdd: 4259 case Instruction::Sub: 4260 case Instruction::FSub: 4261 case Instruction::FNeg: 4262 case Instruction::Mul: 4263 case Instruction::FMul: 4264 case Instruction::FDiv: 4265 case Instruction::FRem: 4266 case Instruction::Shl: 4267 case Instruction::LShr: 4268 case Instruction::AShr: 4269 case Instruction::And: 4270 case Instruction::Or: 4271 case Instruction::Xor: { 4272 // Just widen unops and binops. 
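  // Illustrative sketch (hypothetical IR, VF = 4): a scalar
  //   %add = add nsw i32 %a, %b
  // is widened once per unroll part into
  //   %add.vec = add nsw <4 x i32> %a.vec, %b.vec
  // with the IR flags of the original instruction copied via copyIRFlags().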
4273 setDebugLocFromInst(Builder, &I); 4274 4275 for (unsigned Part = 0; Part < UF; ++Part) { 4276 SmallVector<Value *, 2> Ops; 4277 for (VPValue *VPOp : User.operands()) 4278 Ops.push_back(State.get(VPOp, Part)); 4279 4280 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4281 4282 if (auto *VecOp = dyn_cast<Instruction>(V)) 4283 VecOp->copyIRFlags(&I); 4284 4285 // Use this vector value for all users of the original instruction. 4286 VectorLoopValueMap.setVectorValue(&I, Part, V); 4287 addMetadata(V, &I); 4288 } 4289 4290 break; 4291 } 4292 case Instruction::ICmp: 4293 case Instruction::FCmp: { 4294 // Widen compares. Generate vector compares. 4295 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4296 auto *Cmp = cast<CmpInst>(&I); 4297 setDebugLocFromInst(Builder, Cmp); 4298 for (unsigned Part = 0; Part < UF; ++Part) { 4299 Value *A = State.get(User.getOperand(0), Part); 4300 Value *B = State.get(User.getOperand(1), Part); 4301 Value *C = nullptr; 4302 if (FCmp) { 4303 // Propagate fast math flags. 4304 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4305 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4306 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4307 } else { 4308 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4309 } 4310 VectorLoopValueMap.setVectorValue(&I, Part, C); 4311 addMetadata(C, &I); 4312 } 4313 4314 break; 4315 } 4316 4317 case Instruction::ZExt: 4318 case Instruction::SExt: 4319 case Instruction::FPToUI: 4320 case Instruction::FPToSI: 4321 case Instruction::FPExt: 4322 case Instruction::PtrToInt: 4323 case Instruction::IntToPtr: 4324 case Instruction::SIToFP: 4325 case Instruction::UIToFP: 4326 case Instruction::Trunc: 4327 case Instruction::FPTrunc: 4328 case Instruction::BitCast: { 4329 auto *CI = cast<CastInst>(&I); 4330 setDebugLocFromInst(Builder, CI); 4331 4332 /// Vectorize casts. 4333 Type *DestTy = 4334 (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); 4335 4336 for (unsigned Part = 0; Part < UF; ++Part) { 4337 Value *A = State.get(User.getOperand(0), Part); 4338 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4339 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4340 addMetadata(Cast, &I); 4341 } 4342 break; 4343 } 4344 default: 4345 // This instruction is not vectorized by simple widening. 4346 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4347 llvm_unreachable("Unhandled instruction!"); 4348 } // end of switch. 4349 } 4350 4351 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, 4352 VPTransformState &State) { 4353 assert(!isa<DbgInfoIntrinsic>(I) && 4354 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4355 setDebugLocFromInst(Builder, &I); 4356 4357 Module *M = I.getParent()->getParent()->getParent(); 4358 auto *CI = cast<CallInst>(&I); 4359 4360 SmallVector<Type *, 4> Tys; 4361 for (Value *ArgOperand : CI->arg_operands()) 4362 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4363 4364 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4365 4366 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4367 // version of the instruction. 4368 // Is it beneficial to perform intrinsic call compared to lib call? 
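  // Illustrative example (hypothetical call): for llvm.sqrt.f32 at VF = 4, the
  // cost model compares the vector intrinsic llvm.sqrt.v4f32 against a vector
  // library function registered in the VFDatabase (e.g. an SVML or MASSV
  // routine, if one is available for the target); the cheaper non-scalarized
  // option is chosen below.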
4369 bool NeedToScalarize = false; 4370 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4371 bool UseVectorIntrinsic = 4372 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4373 assert((UseVectorIntrinsic || !NeedToScalarize) && 4374 "Instruction should be scalarized elsewhere."); 4375 4376 for (unsigned Part = 0; Part < UF; ++Part) { 4377 SmallVector<Value *, 4> Args; 4378 for (auto &I : enumerate(ArgOperands.operands())) { 4379 // Some intrinsics have a scalar argument - don't replace it with a 4380 // vector. 4381 Value *Arg; 4382 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4383 Arg = State.get(I.value(), Part); 4384 else 4385 Arg = State.get(I.value(), {0, 0}); 4386 Args.push_back(Arg); 4387 } 4388 4389 Function *VectorF; 4390 if (UseVectorIntrinsic) { 4391 // Use vector version of the intrinsic. 4392 Type *TysForDecl[] = {CI->getType()}; 4393 if (VF > 1) 4394 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4395 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4396 assert(VectorF && "Can't retrieve vector intrinsic."); 4397 } else { 4398 // Use vector version of the function call. 4399 const VFShape Shape = 4400 VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/); 4401 #ifndef NDEBUG 4402 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4403 "Can't create vector function."); 4404 #endif 4405 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4406 } 4407 SmallVector<OperandBundleDef, 1> OpBundles; 4408 CI->getOperandBundlesAsDefs(OpBundles); 4409 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4410 4411 if (isa<FPMathOperator>(V)) 4412 V->copyFastMathFlags(CI); 4413 4414 VectorLoopValueMap.setVectorValue(&I, Part, V); 4415 addMetadata(V, &I); 4416 } 4417 } 4418 4419 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, 4420 bool InvariantCond) { 4421 setDebugLocFromInst(Builder, &I); 4422 4423 // The condition can be loop invariant but still defined inside the 4424 // loop. This means that we can't just use the original 'cond' value. 4425 // We have to take the 'vectorized' value and pick the first lane. 4426 // Instcombine will make this a no-op. 4427 4428 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); 4429 4430 for (unsigned Part = 0; Part < UF; ++Part) { 4431 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); 4432 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); 4433 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); 4434 Value *Sel = 4435 Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1); 4436 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4437 addMetadata(Sel, &I); 4438 } 4439 } 4440 4441 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4442 // We should not collect Scalars more than once per VF. Right now, this 4443 // function is called from collectUniformsAndScalars(), which already does 4444 // this check. Collecting Scalars for VF=1 does not make any sense. 4445 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && 4446 "This function should not be visited twice for the same VF"); 4447 4448 SmallSetVector<Instruction *, 8> Worklist; 4449 4450 // These sets are used to seed the analysis with pointers used by memory 4451 // accesses that will remain scalar. 4452 SmallSetVector<Instruction *, 8> ScalarPtrs; 4453 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4454 4455 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 
4456   // The pointer operands of loads and stores will be scalar as long as the
4457   // memory access is not a gather or scatter operation. The value operand of a
4458   // store will remain scalar if the store is scalarized.
4459   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4460     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4461     assert(WideningDecision != CM_Unknown &&
4462            "Widening decision should be ready at this moment");
4463     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4464       if (Ptr == Store->getValueOperand())
4465         return WideningDecision == CM_Scalarize;
4466     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4467            "Ptr is neither a value nor a pointer operand");
4468     return WideningDecision != CM_GatherScatter;
4469   };
4470
4471   // A helper that returns true if the given value is a bitcast or
4472   // getelementptr instruction contained in the loop.
4473   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4474     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4475             isa<GetElementPtrInst>(V)) &&
4476            !TheLoop->isLoopInvariant(V);
4477   };
4478
4479   // A helper that evaluates a memory access's use of a pointer. If the use
4480   // will be a scalar use, and the pointer is only used by memory accesses, we
4481   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4482   // PossibleNonScalarPtrs.
4483   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4484     // We only care about bitcast and getelementptr instructions contained in
4485     // the loop.
4486     if (!isLoopVaryingBitCastOrGEP(Ptr))
4487       return;
4488
4489     // If the pointer has already been identified as scalar (e.g., if it was
4490     // also identified as uniform), there's nothing to do.
4491     auto *I = cast<Instruction>(Ptr);
4492     if (Worklist.count(I))
4493       return;
4494
4495     // If the use of the pointer will be a scalar use, and all users of the
4496     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4497     // place the pointer in PossibleNonScalarPtrs.
4498     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4499           return isa<LoadInst>(U) || isa<StoreInst>(U);
4500         }))
4501       ScalarPtrs.insert(I);
4502     else
4503       PossibleNonScalarPtrs.insert(I);
4504   };
4505
4506   // We seed the scalars analysis with three classes of instructions: (1)
4507   // instructions marked uniform-after-vectorization, (2) bitcast and
4508   // getelementptr instructions used by memory accesses requiring a scalar use,
4509   // and (3) pointer induction variables and their update instructions (we
4510   // currently only scalarize these).
4511   //
4512   // (1) Add to the worklist all instructions that have been identified as
4513   // uniform-after-vectorization.
4514   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4515
4516   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4517   // memory accesses requiring a scalar use. The pointer operands of loads and
4518   // stores will be scalar as long as the memory access is not a gather or
4519   // scatter operation. The value operand of a store will remain scalar if the
4520   // store is scalarized.
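  // Illustrative example (hypothetical IR): given
  //   %gep = getelementptr inbounds i32, i32* %A, i64 %iv
  //   store i32 %val, i32* %gep
  // %gep is seeded into ScalarPtrs if the store's widening decision is not
  // CM_GatherScatter and %gep is used only by memory accesses; otherwise it
  // lands in PossibleNonScalarPtrs. The loop below applies evaluatePtrUse()
  // to every load and store in the loop body.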
4521 for (auto *BB : TheLoop->blocks()) 4522 for (auto &I : *BB) { 4523 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4524 evaluatePtrUse(Load, Load->getPointerOperand()); 4525 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4526 evaluatePtrUse(Store, Store->getPointerOperand()); 4527 evaluatePtrUse(Store, Store->getValueOperand()); 4528 } 4529 } 4530 for (auto *I : ScalarPtrs) 4531 if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) { 4532 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4533 Worklist.insert(I); 4534 } 4535 4536 // (3) Add to the worklist all pointer induction variables and their update 4537 // instructions. 4538 // 4539 // TODO: Once we are able to vectorize pointer induction variables we should 4540 // no longer insert them into the worklist here. 4541 auto *Latch = TheLoop->getLoopLatch(); 4542 for (auto &Induction : Legal->getInductionVars()) { 4543 auto *Ind = Induction.first; 4544 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4545 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) 4546 continue; 4547 Worklist.insert(Ind); 4548 Worklist.insert(IndUpdate); 4549 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4550 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4551 << "\n"); 4552 } 4553 4554 // Insert the forced scalars. 4555 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4556 // induction variable when the PHI user is scalarized. 4557 auto ForcedScalar = ForcedScalars.find(VF); 4558 if (ForcedScalar != ForcedScalars.end()) 4559 for (auto *I : ForcedScalar->second) 4560 Worklist.insert(I); 4561 4562 // Expand the worklist by looking through any bitcasts and getelementptr 4563 // instructions we've already identified as scalar. This is similar to the 4564 // expansion step in collectLoopUniforms(); however, here we're only 4565 // expanding to include additional bitcasts and getelementptr instructions. 4566 unsigned Idx = 0; 4567 while (Idx != Worklist.size()) { 4568 Instruction *Dst = Worklist[Idx++]; 4569 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4570 continue; 4571 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4572 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4573 auto *J = cast<Instruction>(U); 4574 return !TheLoop->contains(J) || Worklist.count(J) || 4575 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4576 isScalarUse(J, Src)); 4577 })) { 4578 Worklist.insert(Src); 4579 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4580 } 4581 } 4582 4583 // An induction variable will remain scalar if all users of the induction 4584 // variable and induction variable update remain scalar. 4585 for (auto &Induction : Legal->getInductionVars()) { 4586 auto *Ind = Induction.first; 4587 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4588 4589 // We already considered pointer induction variables, so there's no reason 4590 // to look at their users again. 4591 // 4592 // TODO: Once we are able to vectorize pointer induction variables we 4593 // should no longer skip over them here. 4594 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) 4595 continue; 4596 4597 // If tail-folding is applied, the primary induction variable will be used 4598 // to feed a vector compare. 
4599 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4600 continue; 4601 4602 // Determine if all users of the induction variable are scalar after 4603 // vectorization. 4604 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4605 auto *I = cast<Instruction>(U); 4606 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4607 }); 4608 if (!ScalarInd) 4609 continue; 4610 4611 // Determine if all users of the induction variable update instruction are 4612 // scalar after vectorization. 4613 auto ScalarIndUpdate = 4614 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4615 auto *I = cast<Instruction>(U); 4616 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4617 }); 4618 if (!ScalarIndUpdate) 4619 continue; 4620 4621 // The induction variable and its update instruction will remain scalar. 4622 Worklist.insert(Ind); 4623 Worklist.insert(IndUpdate); 4624 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4625 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4626 << "\n"); 4627 } 4628 4629 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4630 } 4631 4632 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { 4633 if (!blockNeedsPredication(I->getParent())) 4634 return false; 4635 switch(I->getOpcode()) { 4636 default: 4637 break; 4638 case Instruction::Load: 4639 case Instruction::Store: { 4640 if (!Legal->isMaskRequired(I)) 4641 return false; 4642 auto *Ptr = getLoadStorePointerOperand(I); 4643 auto *Ty = getMemInstValueType(I); 4644 // We have already decided how to vectorize this instruction, get that 4645 // result. 4646 if (VF > 1) { 4647 InstWidening WideningDecision = getWideningDecision(I, VF); 4648 assert(WideningDecision != CM_Unknown && 4649 "Widening decision should be ready at this moment"); 4650 return WideningDecision == CM_Scalarize; 4651 } 4652 const Align Alignment = getLoadStoreAlignment(I); 4653 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4654 isLegalMaskedGather(Ty, Alignment)) 4655 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4656 isLegalMaskedScatter(Ty, Alignment)); 4657 } 4658 case Instruction::UDiv: 4659 case Instruction::SDiv: 4660 case Instruction::SRem: 4661 case Instruction::URem: 4662 return mayDivideByZero(*I); 4663 } 4664 return false; 4665 } 4666 4667 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, 4668 unsigned VF) { 4669 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4670 assert(getWideningDecision(I, VF) == CM_Unknown && 4671 "Decision should not be set yet."); 4672 auto *Group = getInterleavedAccessGroup(I); 4673 assert(Group && "Must have a group."); 4674 4675 // If the instruction's allocated size doesn't equal it's type size, it 4676 // requires padding and will be scalarized. 4677 auto &DL = I->getModule()->getDataLayout(); 4678 auto *ScalarTy = getMemInstValueType(I); 4679 if (hasIrregularType(ScalarTy, DL, VF)) 4680 return false; 4681 4682 // Check if masking is required. 4683 // A Group may need masking for one of two reasons: it resides in a block that 4684 // needs predication, or it was decided to use masking to deal with gaps. 
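  // Illustrative example (hypothetical group): a strided group whose last
  // member is missing (a gap) may need a scalar epilogue to avoid accessing
  // memory past the final iteration; when the scalar epilogue is not allowed,
  // that gap has to be handled by masking instead. A group located in a
  // predicated block likewise needs a mask. These are the two cases computed
  // below.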
4685   bool PredicatedAccessRequiresMasking =
4686       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4687   bool AccessWithGapsRequiresMasking =
4688       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4689   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4690     return true;
4691
4692   // If masked interleaving is required, we expect that the user/target had
4693   // enabled it, because otherwise it either wouldn't have been created or
4694   // it should have been invalidated by the CostModel.
4695   assert(useMaskedInterleavedAccesses(TTI) &&
4696          "Masked interleave-groups for predicated accesses are not enabled.");
4697
4698   auto *Ty = getMemInstValueType(I);
4699   const Align Alignment = getLoadStoreAlignment(I);
4700   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4701                           : TTI.isLegalMaskedStore(Ty, Alignment);
4702 }
4703
4704 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4705                                                                unsigned VF) {
4706   // Get and ensure we have a valid memory instruction.
4707   LoadInst *LI = dyn_cast<LoadInst>(I);
4708   StoreInst *SI = dyn_cast<StoreInst>(I);
4709   assert((LI || SI) && "Invalid memory instruction");
4710
4711   auto *Ptr = getLoadStorePointerOperand(I);
4712
4713   // In order to be widened, the pointer should be consecutive, first of all.
4714   if (!Legal->isConsecutivePtr(Ptr))
4715     return false;
4716
4717   // If the instruction is a store located in a predicated block, it will be
4718   // scalarized.
4719   if (isScalarWithPredication(I))
4720     return false;
4721
4722   // If the instruction's allocated size doesn't equal its type size, it
4723   // requires padding and will be scalarized.
4724   auto &DL = I->getModule()->getDataLayout();
4725   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4726   if (hasIrregularType(ScalarTy, DL, VF))
4727     return false;
4728
4729   return true;
4730 }
4731
4732 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4733   // We should not collect Uniforms more than once per VF. Right now,
4734   // this function is called from collectUniformsAndScalars(), which
4735   // already does this check. Collecting Uniforms for VF=1 does not make any
4736   // sense.
4737
4738   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4739          "This function should not be visited twice for the same VF");
4740
4741   // Visit the list of Uniforms. Even if we find no uniform value, the entry
4742   // created here marks VF as analyzed: Uniforms.count(VF) will return 1.
4743   Uniforms[VF].clear();
4744
4745   // We now know that the loop is vectorizable!
4746   // Collect instructions inside the loop that will remain uniform after
4747   // vectorization.
4748
4749   // Global values, parameters, and instructions outside of the current loop
4750   // are out of scope.
4751   auto isOutOfScope = [&](Value *V) -> bool {
4752     Instruction *I = dyn_cast<Instruction>(V);
4753     return (!I || !TheLoop->contains(I));
4754   };
4755
4756   SetVector<Instruction *> Worklist;
4757   BasicBlock *Latch = TheLoop->getLoopLatch();
4758
4759   // Instructions that are scalar with predication must not be considered
4760   // uniform after vectorization, because that would create an erroneous
4761   // replicating region where only a single instance out of VF should be formed.
4762   // TODO: optimize such seldom cases if found important, see PR40816.
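  // Illustrative example: a conditional store that must be scalarized and
  // predicated needs VF separate scalar instances, each guarded by its own
  // lane's mask; treating it as uniform-after-vectorization would emit a
  // single instance for all lanes. The helper defined below filters such
  // instructions out of the uniforms worklist.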
4763 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4764 if (isScalarWithPredication(I, VF)) { 4765 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4766 << *I << "\n"); 4767 return; 4768 } 4769 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4770 Worklist.insert(I); 4771 }; 4772 4773 // Start with the conditional branch. If the branch condition is an 4774 // instruction contained in the loop that is only used by the branch, it is 4775 // uniform. 4776 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4777 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4778 addToWorklistIfAllowed(Cmp); 4779 4780 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 4781 // are pointers that are treated like consecutive pointers during 4782 // vectorization. The pointer operands of interleaved accesses are an 4783 // example. 4784 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 4785 4786 // Holds pointer operands of instructions that are possibly non-uniform. 4787 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 4788 4789 auto isUniformDecision = [&](Instruction *I, unsigned VF) { 4790 InstWidening WideningDecision = getWideningDecision(I, VF); 4791 assert(WideningDecision != CM_Unknown && 4792 "Widening decision should be ready at this moment"); 4793 4794 return (WideningDecision == CM_Widen || 4795 WideningDecision == CM_Widen_Reverse || 4796 WideningDecision == CM_Interleave); 4797 }; 4798 // Iterate over the instructions in the loop, and collect all 4799 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 4800 // that a consecutive-like pointer operand will be scalarized, we collect it 4801 // in PossibleNonUniformPtrs instead. We use two sets here because a single 4802 // getelementptr instruction can be used by both vectorized and scalarized 4803 // memory instructions. For example, if a loop loads and stores from the same 4804 // location, but the store is conditional, the store will be scalarized, and 4805 // the getelementptr won't remain uniform. 4806 for (auto *BB : TheLoop->blocks()) 4807 for (auto &I : *BB) { 4808 // If there's no pointer operand, there's nothing to do. 4809 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 4810 if (!Ptr) 4811 continue; 4812 4813 // True if all users of Ptr are memory accesses that have Ptr as their 4814 // pointer operand. 4815 auto UsersAreMemAccesses = 4816 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 4817 return getLoadStorePointerOperand(U) == Ptr; 4818 }); 4819 4820 // Ensure the memory instruction will not be scalarized or used by 4821 // gather/scatter, making its pointer operand non-uniform. If the pointer 4822 // operand is used by any instruction other than a memory access, we 4823 // conservatively assume the pointer operand may be non-uniform. 4824 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 4825 PossibleNonUniformPtrs.insert(Ptr); 4826 4827 // If the memory instruction will be vectorized and its pointer operand 4828 // is consecutive-like, or interleaving - the pointer operand should 4829 // remain uniform. 4830 else 4831 ConsecutiveLikePtrs.insert(Ptr); 4832 } 4833 4834 // Add to the Worklist all consecutive and consecutive-like pointers that 4835 // aren't also identified as possibly non-uniform. 
4836 for (auto *V : ConsecutiveLikePtrs) 4837 if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) 4838 addToWorklistIfAllowed(V); 4839 4840 // Expand Worklist in topological order: whenever a new instruction 4841 // is added , its users should be already inside Worklist. It ensures 4842 // a uniform instruction will only be used by uniform instructions. 4843 unsigned idx = 0; 4844 while (idx != Worklist.size()) { 4845 Instruction *I = Worklist[idx++]; 4846 4847 for (auto OV : I->operand_values()) { 4848 // isOutOfScope operands cannot be uniform instructions. 4849 if (isOutOfScope(OV)) 4850 continue; 4851 // First order recurrence Phi's should typically be considered 4852 // non-uniform. 4853 auto *OP = dyn_cast<PHINode>(OV); 4854 if (OP && Legal->isFirstOrderRecurrence(OP)) 4855 continue; 4856 // If all the users of the operand are uniform, then add the 4857 // operand into the uniform worklist. 4858 auto *OI = cast<Instruction>(OV); 4859 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4860 auto *J = cast<Instruction>(U); 4861 return Worklist.count(J) || 4862 (OI == getLoadStorePointerOperand(J) && 4863 isUniformDecision(J, VF)); 4864 })) 4865 addToWorklistIfAllowed(OI); 4866 } 4867 } 4868 4869 // Returns true if Ptr is the pointer operand of a memory access instruction 4870 // I, and I is known to not require scalarization. 4871 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4872 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4873 }; 4874 4875 // For an instruction to be added into Worklist above, all its users inside 4876 // the loop should also be in Worklist. However, this condition cannot be 4877 // true for phi nodes that form a cyclic dependence. We must process phi 4878 // nodes separately. An induction variable will remain uniform if all users 4879 // of the induction variable and induction variable update remain uniform. 4880 // The code below handles both pointer and non-pointer induction variables. 4881 for (auto &Induction : Legal->getInductionVars()) { 4882 auto *Ind = Induction.first; 4883 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4884 4885 // Determine if all users of the induction variable are uniform after 4886 // vectorization. 4887 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4888 auto *I = cast<Instruction>(U); 4889 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4890 isVectorizedMemAccessUse(I, Ind); 4891 }); 4892 if (!UniformInd) 4893 continue; 4894 4895 // Determine if all users of the induction variable update instruction are 4896 // uniform after vectorization. 4897 auto UniformIndUpdate = 4898 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4899 auto *I = cast<Instruction>(U); 4900 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4901 isVectorizedMemAccessUse(I, IndUpdate); 4902 }); 4903 if (!UniformIndUpdate) 4904 continue; 4905 4906 // The induction variable and its update instruction will remain uniform. 4907 addToWorklistIfAllowed(Ind); 4908 addToWorklistIfAllowed(IndUpdate); 4909 } 4910 4911 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4912 } 4913 4914 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4915 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4916 4917 if (Legal->getRuntimePointerChecking()->Need) { 4918 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4919 "runtime pointer checks needed. 
Enable vectorization of this "
4920         "loop with '#pragma clang loop vectorize(enable)' when "
4921         "compiling with -Os/-Oz",
4922         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4923     return true;
4924   }
4925
4926   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4927     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4928         "runtime SCEV checks needed. Enable vectorization of this "
4929         "loop with '#pragma clang loop vectorize(enable)' when "
4930         "compiling with -Os/-Oz",
4931         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4932     return true;
4933   }
4934
4935   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4936   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4937     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4938         "runtime stride == 1 checks needed. Enable vectorization of "
4939         "this loop with '#pragma clang loop vectorize(enable)' when "
4940         "compiling with -Os/-Oz",
4941         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4942     return true;
4943   }
4944
4945   return false;
4946 }
4947
4948 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
4949                                                             unsigned UserIC) {
4950   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4951     // TODO: It may be useful to do this, since it's still likely to be
4952     // dynamically uniform if the target can skip.
4953     reportVectorizationFailure(
4954         "Not inserting runtime ptr check for divergent target",
4955         "runtime pointer checks needed. Not enabled for divergent target",
4956         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4957     return None;
4958   }
4959
4960   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4961   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4962   if (TC == 1) {
4963     reportVectorizationFailure("Single iteration (non) loop",
4964         "loop trip count is one, irrelevant for vectorization",
4965         "SingleIterationLoop", ORE, TheLoop);
4966     return None;
4967   }
4968
4969   switch (ScalarEpilogueStatus) {
4970   case CM_ScalarEpilogueAllowed:
4971     return UserVF ? UserVF : computeFeasibleMaxVF(TC);
4972   case CM_ScalarEpilogueNotNeededUsePredicate:
4973     LLVM_DEBUG(
4974         dbgs() << "LV: vector predicate hint/switch found.\n"
4975                << "LV: Not allowing scalar epilogue, creating predicated "
4976                << "vector loop.\n");
4977     break;
4978   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4979     // fallthrough as a special case of OptForSize
4980   case CM_ScalarEpilogueNotAllowedOptSize:
4981     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4982       LLVM_DEBUG(
4983           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4984     else
4985       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4986                         << "count.\n");
4987
4988     // Bail if runtime checks are required, which are not good when optimizing
4989     // for size.
4990     if (runtimeChecksRequired())
4991       return None;
4992     break;
4993   }
4994
4995   // Now try tail folding.
4996
4997   // Invalidate interleave groups that require an epilogue if we can't mask
4998   // the interleave-group.
4999   if (!useMaskedInterleavedAccesses(TTI)) {
5000     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5001            "No decisions should have been taken at this point");
5002     // Note: There is no need to invalidate any cost modeling decisions here, as
5003     // none were taken so far.
5004     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5005   }
5006
5007   unsigned MaxVF = UserVF ?
UserVF : computeFeasibleMaxVF(TC); 5008 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF; 5009 if (TC > 0 && TC % MaxVFtimesIC == 0) { 5010 // Accept MaxVF if we do not have a tail. 5011 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5012 return MaxVF; 5013 } 5014 5015 // If we don't know the precise trip count, or if the trip count that we 5016 // found modulo the vectorization factor is not zero, try to fold the tail 5017 // by masking. 5018 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5019 if (Legal->prepareToFoldTailByMasking()) { 5020 FoldTailByMasking = true; 5021 return MaxVF; 5022 } 5023 5024 if (TC == 0) { 5025 reportVectorizationFailure( 5026 "Unable to calculate the loop count due to complex control flow", 5027 "unable to calculate the loop count due to complex control flow", 5028 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5029 return None; 5030 } 5031 5032 reportVectorizationFailure( 5033 "Cannot optimize for size and vectorize at the same time.", 5034 "cannot optimize for size and vectorize at the same time. " 5035 "Enable vectorization of this loop with '#pragma clang loop " 5036 "vectorize(enable)' when compiling with -Os/-Oz", 5037 "NoTailLoopWithOptForSize", ORE, TheLoop); 5038 return None; 5039 } 5040 5041 unsigned 5042 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 5043 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5044 unsigned SmallestType, WidestType; 5045 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5046 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5047 5048 // Get the maximum safe dependence distance in bits computed by LAA. 5049 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5050 // the memory accesses that is most restrictive (involved in the smallest 5051 // dependence distance). 5052 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 5053 5054 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 5055 5056 unsigned MaxVectorSize = WidestRegister / WidestType; 5057 5058 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5059 << " / " << WidestType << " bits.\n"); 5060 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5061 << WidestRegister << " bits.\n"); 5062 5063 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 5064 " into one vector!"); 5065 if (MaxVectorSize == 0) { 5066 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5067 MaxVectorSize = 1; 5068 return MaxVectorSize; 5069 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5070 isPowerOf2_32(ConstTripCount)) { 5071 // We need to clamp the VF to be the ConstTripCount. There is no point in 5072 // choosing a higher viable VF as done in the loop below. 5073 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5074 << ConstTripCount << "\n"); 5075 MaxVectorSize = ConstTripCount; 5076 return MaxVectorSize; 5077 } 5078 5079 unsigned MaxVF = MaxVectorSize; 5080 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5081 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5082 // Collect all viable vectorization factors larger than the default MaxVF 5083 // (i.e. MaxVectorSize). 
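  // Illustrative example (hypothetical target): with a 256-bit widest
  // register, a widest type of i32 and a smallest type of i8, the default
  // MaxVectorSize is 256 / 32 = 8 while WidestRegister / SmallestType = 32,
  // so the candidate VFs collected below are 16 and 32; the largest one whose
  // register usage still fits the target is then selected.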
5084 SmallVector<unsigned, 8> VFs; 5085 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5086 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5087 VFs.push_back(VS); 5088 5089 // For each VF calculate its register usage. 5090 auto RUs = calculateRegisterUsage(VFs); 5091 5092 // Select the largest VF which doesn't require more registers than existing 5093 // ones. 5094 for (int i = RUs.size() - 1; i >= 0; --i) { 5095 bool Selected = true; 5096 for (auto& pair : RUs[i].MaxLocalUsers) { 5097 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5098 if (pair.second > TargetNumRegisters) 5099 Selected = false; 5100 } 5101 if (Selected) { 5102 MaxVF = VFs[i]; 5103 break; 5104 } 5105 } 5106 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5107 if (MaxVF < MinVF) { 5108 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5109 << ") with target's minimum: " << MinVF << '\n'); 5110 MaxVF = MinVF; 5111 } 5112 } 5113 } 5114 return MaxVF; 5115 } 5116 5117 VectorizationFactor 5118 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5119 float Cost = expectedCost(1).first; 5120 const float ScalarCost = Cost; 5121 unsigned Width = 1; 5122 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5123 5124 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5125 if (ForceVectorization && MaxVF > 1) { 5126 // Ignore scalar width, because the user explicitly wants vectorization. 5127 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5128 // evaluation. 5129 Cost = std::numeric_limits<float>::max(); 5130 } 5131 5132 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5133 // Notice that the vector loop needs to be executed less times, so 5134 // we need to divide the cost of the vector loops by the width of 5135 // the vector elements. 5136 VectorizationCostTy C = expectedCost(i); 5137 float VectorCost = C.first / (float)i; 5138 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5139 << " costs: " << (int)VectorCost << ".\n"); 5140 if (!C.second && !ForceVectorization) { 5141 LLVM_DEBUG( 5142 dbgs() << "LV: Not considering vector loop of width " << i 5143 << " because it will not generate any vector instructions.\n"); 5144 continue; 5145 } 5146 if (VectorCost < Cost) { 5147 Cost = VectorCost; 5148 Width = i; 5149 } 5150 } 5151 5152 if (!EnableCondStoresVectorization && NumPredStores) { 5153 reportVectorizationFailure("There are conditional stores.", 5154 "store that is conditionally executed prevents vectorization", 5155 "ConditionalStore", ORE, TheLoop); 5156 Width = 1; 5157 Cost = ScalarCost; 5158 } 5159 5160 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5161 << "LV: Vectorization seems to be not beneficial, " 5162 << "but was forced by a user.\n"); 5163 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5164 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; 5165 return Factor; 5166 } 5167 5168 std::pair<unsigned, unsigned> 5169 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5170 unsigned MinWidth = -1U; 5171 unsigned MaxWidth = 8; 5172 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5173 5174 // For each block. 5175 for (BasicBlock *BB : TheLoop->blocks()) { 5176 // For each instruction in the loop. 5177 for (Instruction &I : BB->instructionsWithoutDebug()) { 5178 Type *T = I.getType(); 5179 5180 // Skip ignored values. 
5181 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end()) 5182 continue; 5183 5184 // Only examine Loads, Stores and PHINodes. 5185 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5186 continue; 5187 5188 // Examine PHI nodes that are reduction variables. Update the type to 5189 // account for the recurrence type. 5190 if (auto *PN = dyn_cast<PHINode>(&I)) { 5191 if (!Legal->isReductionVariable(PN)) 5192 continue; 5193 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5194 T = RdxDesc.getRecurrenceType(); 5195 } 5196 5197 // Examine the stored values. 5198 if (auto *ST = dyn_cast<StoreInst>(&I)) 5199 T = ST->getValueOperand()->getType(); 5200 5201 // Ignore loaded pointer types and stored pointer types that are not 5202 // vectorizable. 5203 // 5204 // FIXME: The check here attempts to predict whether a load or store will 5205 // be vectorized. We only know this for certain after a VF has 5206 // been selected. Here, we assume that if an access can be 5207 // vectorized, it will be. We should also look at extending this 5208 // optimization to non-pointer types. 5209 // 5210 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5211 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5212 continue; 5213 5214 MinWidth = std::min(MinWidth, 5215 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5216 MaxWidth = std::max(MaxWidth, 5217 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5218 } 5219 } 5220 5221 return {MinWidth, MaxWidth}; 5222 } 5223 5224 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, 5225 unsigned LoopCost) { 5226 // -- The interleave heuristics -- 5227 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5228 // There are many micro-architectural considerations that we can't predict 5229 // at this level. For example, frontend pressure (on decode or fetch) due to 5230 // code size, or the number and capabilities of the execution ports. 5231 // 5232 // We use the following heuristics to select the interleave count: 5233 // 1. If the code has reductions, then we interleave to break the cross 5234 // iteration dependency. 5235 // 2. If the loop is really small, then we interleave to reduce the loop 5236 // overhead. 5237 // 3. We don't interleave if we think that we will spill registers to memory 5238 // due to the increased register pressure. 5239 5240 if (!isScalarEpilogueAllowed()) 5241 return 1; 5242 5243 // We used the distance for the interleave count. 5244 if (Legal->getMaxSafeDepDistBytes() != -1U) 5245 return 1; 5246 5247 // Do not interleave loops with a relatively small known or estimated trip 5248 // count. 5249 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5250 if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) 5251 return 1; 5252 5253 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5254 // We divide by these constants so assume that we have at least one 5255 // instruction that uses at least one register. 5256 for (auto& pair : R.MaxLocalUsers) { 5257 pair.second = std::max(pair.second, 1U); 5258 } 5259 5260 // We calculate the interleave count using the following formula. 5261 // Subtract the number of loop invariants from the number of available 5262 // registers. These registers are used by all of the interleaved instances. 5263 // Next, divide the remaining registers by the number of registers that is 5264 // required by the loop, in order to estimate how many parallel instances 5265 // fit without causing spills. 
All of this is rounded down if necessary to be 5266 // a power of two. We want power of two interleave count to simplify any 5267 // addressing operations or alignment considerations. 5268 // We also want power of two interleave counts to ensure that the induction 5269 // variable of the vector loop wraps to zero, when tail is folded by masking; 5270 // this currently happens when OptForSize, in which case IC is set to 1 above. 5271 unsigned IC = UINT_MAX; 5272 5273 for (auto& pair : R.MaxLocalUsers) { 5274 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5275 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5276 << " registers of " 5277 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5278 if (VF == 1) { 5279 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5280 TargetNumRegisters = ForceTargetNumScalarRegs; 5281 } else { 5282 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5283 TargetNumRegisters = ForceTargetNumVectorRegs; 5284 } 5285 unsigned MaxLocalUsers = pair.second; 5286 unsigned LoopInvariantRegs = 0; 5287 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5288 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5289 5290 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5291 // Don't count the induction variable as interleaved. 5292 if (EnableIndVarRegisterHeur) { 5293 TmpIC = 5294 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5295 std::max(1U, (MaxLocalUsers - 1))); 5296 } 5297 5298 IC = std::min(IC, TmpIC); 5299 } 5300 5301 // Clamp the interleave ranges to reasonable counts. 5302 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5303 5304 // Check if the user has overridden the max. 5305 if (VF == 1) { 5306 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5307 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5308 } else { 5309 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5310 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5311 } 5312 5313 // If trip count is known or estimated compile time constant, limit the 5314 // interleave count to be less than the trip count divided by VF. 5315 if (BestKnownTC) { 5316 MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); 5317 } 5318 5319 // If we did not calculate the cost for VF (because the user selected the VF) 5320 // then we calculate the cost of VF here. 5321 if (LoopCost == 0) 5322 LoopCost = expectedCost(VF).first; 5323 5324 assert(LoopCost && "Non-zero loop cost expected"); 5325 5326 // Clamp the calculated IC to be between the 1 and the max interleave count 5327 // that the target and trip count allows. 5328 if (IC > MaxInterleaveCount) 5329 IC = MaxInterleaveCount; 5330 else if (IC < 1) 5331 IC = 1; 5332 5333 // Interleave if we vectorized this loop and there is a reduction that could 5334 // benefit from interleaving. 5335 if (VF > 1 && !Legal->getReductionVars().empty()) { 5336 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5337 return IC; 5338 } 5339 5340 // Note that if we've already vectorized the loop we will have done the 5341 // runtime check and so interleaving won't require further checks. 5342 bool InterleavingRequiresRuntimePointerCheck = 5343 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5344 5345 // We want to interleave small loops in order to reduce the loop overhead and 5346 // potentially expose ILP opportunities. 
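  // Illustrative example (hypothetical numbers): if IC was computed as 8, the
  // loop cost is 4 and the SmallLoopCost threshold is 20, then SmallIC below
  // is min(8, PowerOf2Floor(20 / 4)) = 4; with 2 stores and 1 load, StoresIC
  // = 8 / 2 = 4 and LoadsIC = 8 / 1 = 8, so the load/store-port heuristic,
  // if enabled, would raise the returned count to 8.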
5347   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5348   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5349     // We assume that the loop overhead has a cost of 1, and we use the cost
5350     // model to estimate the cost of the loop; we interleave until the cost of
5351     // the loop overhead is about 5% of the cost of the loop.
5352     unsigned SmallIC =
5353         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5354
5355     // Interleave until store/load ports (estimated by max interleave count) are
5356     // saturated.
5357     unsigned NumStores = Legal->getNumStores();
5358     unsigned NumLoads = Legal->getNumLoads();
5359     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5360     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5361
5362     // If we have a scalar reduction (vector reductions are already dealt with
5363     // by this point), we can increase the critical path length if the loop
5364     // we're interleaving is inside another loop. Limit it, by default to 2, so
5365     // the critical path only gets increased by one reduction operation.
5366     if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
5367       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5368       SmallIC = std::min(SmallIC, F);
5369       StoresIC = std::min(StoresIC, F);
5370       LoadsIC = std::min(LoadsIC, F);
5371     }
5372
5373     if (EnableLoadStoreRuntimeInterleave &&
5374         std::max(StoresIC, LoadsIC) > SmallIC) {
5375       LLVM_DEBUG(
5376           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5377       return std::max(StoresIC, LoadsIC);
5378     }
5379
5380     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5381     return SmallIC;
5382   }
5383
5384   // Interleave if this is a large loop (small loops are already dealt with by
5385   // this point) that could benefit from interleaving.
5386   bool HasReductions = !Legal->getReductionVars().empty();
5387   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5388     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5389     return IC;
5390   }
5391
5392   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5393   return 1;
5394 }
5395
5396 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5397 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5398   // This function calculates the register usage by measuring the highest number
5399   // of values that are alive at a single location. Obviously, this is a very
5400   // rough estimation. We scan the loop in topological order and
5401   // assign a number to each instruction. We use RPO to ensure that defs are
5402   // met before their users. We assume that each instruction that has in-loop
5403   // users starts an interval. We record every time that an in-loop value is
5404   // used, so we have a list of the first and last occurrences of each
5405   // instruction. Next, we transpose this data structure into a multi map that
5406   // holds the list of intervals that *end* at a specific location. This multi
5407   // map allows us to perform a linear search. We scan the instructions linearly
5408   // and record each time that a new interval starts, by placing it in a set.
5409   // If we find this value in the multi-map then we remove it from the set.
5410   // The max register usage is the maximum size of the set.
5411   // We also search for instructions that are defined outside the loop, but are
5412   // used inside the loop.
We need this number separately from the max-interval 5413 // usage number because when we unroll, loop-invariant values do not take 5414 // more register. 5415 LoopBlocksDFS DFS(TheLoop); 5416 DFS.perform(LI); 5417 5418 RegisterUsage RU; 5419 5420 // Each 'key' in the map opens a new interval. The values 5421 // of the map are the index of the 'last seen' usage of the 5422 // instruction that is the key. 5423 using IntervalMap = DenseMap<Instruction *, unsigned>; 5424 5425 // Maps instruction to its index. 5426 SmallVector<Instruction *, 64> IdxToInstr; 5427 // Marks the end of each interval. 5428 IntervalMap EndPoint; 5429 // Saves the list of instruction indices that are used in the loop. 5430 SmallPtrSet<Instruction *, 8> Ends; 5431 // Saves the list of values that are used in the loop but are 5432 // defined outside the loop, such as arguments and constants. 5433 SmallPtrSet<Value *, 8> LoopInvariants; 5434 5435 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5436 for (Instruction &I : BB->instructionsWithoutDebug()) { 5437 IdxToInstr.push_back(&I); 5438 5439 // Save the end location of each USE. 5440 for (Value *U : I.operands()) { 5441 auto *Instr = dyn_cast<Instruction>(U); 5442 5443 // Ignore non-instruction values such as arguments, constants, etc. 5444 if (!Instr) 5445 continue; 5446 5447 // If this instruction is outside the loop then record it and continue. 5448 if (!TheLoop->contains(Instr)) { 5449 LoopInvariants.insert(Instr); 5450 continue; 5451 } 5452 5453 // Overwrite previous end points. 5454 EndPoint[Instr] = IdxToInstr.size(); 5455 Ends.insert(Instr); 5456 } 5457 } 5458 } 5459 5460 // Saves the list of intervals that end with the index in 'key'. 5461 using InstrList = SmallVector<Instruction *, 2>; 5462 DenseMap<unsigned, InstrList> TransposeEnds; 5463 5464 // Transpose the EndPoints to a list of values that end at each index. 5465 for (auto &Interval : EndPoint) 5466 TransposeEnds[Interval.second].push_back(Interval.first); 5467 5468 SmallPtrSet<Instruction *, 8> OpenIntervals; 5469 5470 // Get the size of the widest register. 5471 unsigned MaxSafeDepDist = -1U; 5472 if (Legal->getMaxSafeDepDistBytes() != -1U) 5473 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5474 unsigned WidestRegister = 5475 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5476 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5477 5478 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5479 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5480 5481 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5482 5483 // A lambda that gets the register usage for the given type and VF. 5484 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5485 if (Ty->isTokenTy()) 5486 return 0U; 5487 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5488 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5489 }; 5490 5491 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5492 Instruction *I = IdxToInstr[i]; 5493 5494 // Remove all of the instructions that end at this location. 5495 InstrList &List = TransposeEnds[i]; 5496 for (Instruction *ToRemove : List) 5497 OpenIntervals.erase(ToRemove); 5498 5499 // Ignore instructions that are never used within the loop. 5500 if (Ends.find(I) == Ends.end()) 5501 continue; 5502 5503 // Skip ignored values. 5504 if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) 5505 continue; 5506 5507 // For each VF find the maximum usage of registers. 
5508 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5509 // Count the number of live intervals. 5510 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5511 5512 if (VFs[j] == 1) { 5513 for (auto Inst : OpenIntervals) { 5514 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5515 if (RegUsage.find(ClassID) == RegUsage.end()) 5516 RegUsage[ClassID] = 1; 5517 else 5518 RegUsage[ClassID] += 1; 5519 } 5520 } else { 5521 collectUniformsAndScalars(VFs[j]); 5522 for (auto Inst : OpenIntervals) { 5523 // Skip ignored values for VF > 1. 5524 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) 5525 continue; 5526 if (isScalarAfterVectorization(Inst, VFs[j])) { 5527 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5528 if (RegUsage.find(ClassID) == RegUsage.end()) 5529 RegUsage[ClassID] = 1; 5530 else 5531 RegUsage[ClassID] += 1; 5532 } else { 5533 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5534 if (RegUsage.find(ClassID) == RegUsage.end()) 5535 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5536 else 5537 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5538 } 5539 } 5540 } 5541 5542 for (auto& pair : RegUsage) { 5543 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5544 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5545 else 5546 MaxUsages[j][pair.first] = pair.second; 5547 } 5548 } 5549 5550 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5551 << OpenIntervals.size() << '\n'); 5552 5553 // Add the current instruction to the list of open intervals. 5554 OpenIntervals.insert(I); 5555 } 5556 5557 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5558 SmallMapVector<unsigned, unsigned, 4> Invariant; 5559 5560 for (auto Inst : LoopInvariants) { 5561 unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5562 unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); 5563 if (Invariant.find(ClassID) == Invariant.end()) 5564 Invariant[ClassID] = Usage; 5565 else 5566 Invariant[ClassID] += Usage; 5567 } 5568 5569 LLVM_DEBUG({ 5570 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5571 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5572 << " item\n"; 5573 for (const auto &pair : MaxUsages[i]) { 5574 dbgs() << "LV(REG): RegisterClass: " 5575 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5576 << " registers\n"; 5577 } 5578 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5579 << " item\n"; 5580 for (const auto &pair : Invariant) { 5581 dbgs() << "LV(REG): RegisterClass: " 5582 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5583 << " registers\n"; 5584 } 5585 }); 5586 5587 RU.LoopInvariantRegs = Invariant; 5588 RU.MaxLocalUsers = MaxUsages[i]; 5589 RUs[i] = RU; 5590 } 5591 5592 return RUs; 5593 } 5594 5595 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5596 // TODO: Cost model for emulated masked load/store is completely 5597 // broken. This hack guides the cost model to use an artificially 5598 // high enough value to practically disable vectorization with such 5599 // operations, except where previously deployed legality hack allowed 5600 // using very low cost values. This is to avoid regressions coming simply 5601 // from moving "masked load/store" check from legality to cost model. 5602 // Masked Load/Gather emulation was previously never allowed. 5603 // Limited number of Masked Store/Scatter emulation was allowed. 
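  // In short (clarifying note, added): an emulated masked load always triggers
  // the hack, while an emulated masked store only does so once the number of
  // predicated stores in the loop exceeds NumberOfStoresToPredicate.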
5604 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 5605 return isa<LoadInst>(I) || 5606 (isa<StoreInst>(I) && 5607 NumPredStores > NumberOfStoresToPredicate); 5608 } 5609 5610 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { 5611 // If we aren't vectorizing the loop, or if we've already collected the 5612 // instructions to scalarize, there's nothing to do. Collection may already 5613 // have occurred if we have a user-selected VF and are now computing the 5614 // expected cost for interleaving. 5615 if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end()) 5616 return; 5617 5618 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 5619 // not profitable to scalarize any instructions, the presence of VF in the 5620 // map will indicate that we've analyzed it already. 5621 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 5622 5623 // Find all the instructions that are scalar with predication in the loop and 5624 // determine if it would be better to not if-convert the blocks they are in. 5625 // If so, we also record the instructions to scalarize. 5626 for (BasicBlock *BB : TheLoop->blocks()) { 5627 if (!blockNeedsPredication(BB)) 5628 continue; 5629 for (Instruction &I : *BB) 5630 if (isScalarWithPredication(&I)) { 5631 ScalarCostsTy ScalarCosts; 5632 // Do not apply discount logic if hacked cost is needed 5633 // for emulated masked memrefs. 5634 if (!useEmulatedMaskMemRefHack(&I) && 5635 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 5636 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5637 // Remember that BB will remain after vectorization. 5638 PredicatedBBsAfterVectorization.insert(BB); 5639 } 5640 } 5641 } 5642 5643 int LoopVectorizationCostModel::computePredInstDiscount( 5644 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, 5645 unsigned VF) { 5646 assert(!isUniformAfterVectorization(PredInst, VF) && 5647 "Instruction marked uniform-after-vectorization will be predicated"); 5648 5649 // Initialize the discount to zero, meaning that the scalar version and the 5650 // vector version cost the same. 5651 int Discount = 0; 5652 5653 // Holds instructions to analyze. The instructions we visit are mapped in 5654 // ScalarCosts. Those instructions are the ones that would be scalarized if 5655 // we find that the scalar version costs less. 5656 SmallVector<Instruction *, 8> Worklist; 5657 5658 // Returns true if the given instruction can be scalarized. 5659 auto canBeScalarized = [&](Instruction *I) -> bool { 5660 // We only attempt to scalarize instructions forming a single-use chain 5661 // from the original predicated block that would otherwise be vectorized. 5662 // Although not strictly necessary, we give up on instructions we know will 5663 // already be scalar to avoid traversing chains that are unlikely to be 5664 // beneficial. 5665 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5666 isScalarAfterVectorization(I, VF)) 5667 return false; 5668 5669 // If the instruction is scalar with predication, it will be analyzed 5670 // separately. We ignore it within the context of PredInst. 5671 if (isScalarWithPredication(I)) 5672 return false; 5673 5674 // If any of the instruction's operands are uniform after vectorization, 5675 // the instruction cannot be scalarized. This prevents, for example, a 5676 // masked load from being scalarized. 
5677 // 5678 // We assume we will only emit a value for lane zero of an instruction 5679 // marked uniform after vectorization, rather than VF identical values. 5680 // Thus, if we scalarize an instruction that uses a uniform, we would 5681 // create uses of values corresponding to the lanes we aren't emitting code 5682 // for. This behavior can be changed by allowing getScalarValue to clone 5683 // the lane zero values for uniforms rather than asserting. 5684 for (Use &U : I->operands()) 5685 if (auto *J = dyn_cast<Instruction>(U.get())) 5686 if (isUniformAfterVectorization(J, VF)) 5687 return false; 5688 5689 // Otherwise, we can scalarize the instruction. 5690 return true; 5691 }; 5692 5693 // Compute the expected cost discount from scalarizing the entire expression 5694 // feeding the predicated instruction. We currently only consider expressions 5695 // that are single-use instruction chains. 5696 Worklist.push_back(PredInst); 5697 while (!Worklist.empty()) { 5698 Instruction *I = Worklist.pop_back_val(); 5699 5700 // If we've already analyzed the instruction, there's nothing to do. 5701 if (ScalarCosts.find(I) != ScalarCosts.end()) 5702 continue; 5703 5704 // Compute the cost of the vector instruction. Note that this cost already 5705 // includes the scalarization overhead of the predicated instruction. 5706 unsigned VectorCost = getInstructionCost(I, VF).first; 5707 5708 // Compute the cost of the scalarized instruction. This cost is the cost of 5709 // the instruction as if it wasn't if-converted and instead remained in the 5710 // predicated block. We will scale this cost by block probability after 5711 // computing the scalarization overhead. 5712 unsigned ScalarCost = VF * getInstructionCost(I, 1).first; 5713 5714 // Compute the scalarization overhead of needed insertelement instructions 5715 // and phi nodes. 5716 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5717 ScalarCost += TTI.getScalarizationOverhead( 5718 cast<VectorType>(ToVectorTy(I->getType(), VF)), 5719 APInt::getAllOnesValue(VF), true, false); 5720 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); 5721 } 5722 5723 // Compute the scalarization overhead of needed extractelement 5724 // instructions. For each of the instruction's operands, if the operand can 5725 // be scalarized, add it to the worklist; otherwise, account for the 5726 // overhead. 5727 for (Use &U : I->operands()) 5728 if (auto *J = dyn_cast<Instruction>(U.get())) { 5729 assert(VectorType::isValidElementType(J->getType()) && 5730 "Instruction has non-scalar type"); 5731 if (canBeScalarized(J)) 5732 Worklist.push_back(J); 5733 else if (needsExtract(J, VF)) 5734 ScalarCost += TTI.getScalarizationOverhead( 5735 cast<VectorType>(ToVectorTy(J->getType(), VF)), 5736 APInt::getAllOnesValue(VF), false, true); 5737 } 5738 5739 // Scale the total scalar cost by block probability. 5740 ScalarCost /= getReciprocalPredBlockProb(); 5741 5742 // Compute the discount. A non-negative discount means the vector version 5743 // of the instruction costs more, and scalarizing would be beneficial. 5744 Discount += VectorCost - ScalarCost; 5745 ScalarCosts[I] = ScalarCost; 5746 } 5747 5748 return Discount; 5749 } 5750 5751 LoopVectorizationCostModel::VectorizationCostTy 5752 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5753 VectorizationCostTy Cost; 5754 5755 // For each block. 5756 for (BasicBlock *BB : TheLoop->blocks()) { 5757 VectorizationCostTy BlockCost; 5758 5759 // For each instruction in the old loop. 
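    // Note (added for clarity): VectorizationCostTy pairs the accumulated cost
    // with a flag that is set when at least one instruction's type is genuinely
    // vectorized (i.e. not decomposed back into VF scalar parts); the caller
    // can use the flag to skip VFs that would emit no vector instructions.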
5760 for (Instruction &I : BB->instructionsWithoutDebug()) { 5761 // Skip ignored values. 5762 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() || 5763 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end())) 5764 continue; 5765 5766 VectorizationCostTy C = getInstructionCost(&I, VF); 5767 5768 // Check if we should override the cost. 5769 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5770 C.first = ForceTargetInstructionCost; 5771 5772 BlockCost.first += C.first; 5773 BlockCost.second |= C.second; 5774 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5775 << " for VF " << VF << " For instruction: " << I 5776 << '\n'); 5777 } 5778 5779 // If we are vectorizing a predicated block, it will have been 5780 // if-converted. This means that the block's instructions (aside from 5781 // stores and instructions that may divide by zero) will now be 5782 // unconditionally executed. For the scalar case, we may not always execute 5783 // the predicated block. Thus, scale the block's cost by the probability of 5784 // executing it. 5785 if (VF == 1 && blockNeedsPredication(BB)) 5786 BlockCost.first /= getReciprocalPredBlockProb(); 5787 5788 Cost.first += BlockCost.first; 5789 Cost.second |= BlockCost.second; 5790 } 5791 5792 return Cost; 5793 } 5794 5795 /// Gets Address Access SCEV after verifying that the access pattern 5796 /// is loop invariant except the induction variable dependence. 5797 /// 5798 /// This SCEV can be sent to the Target in order to estimate the address 5799 /// calculation cost. 5800 static const SCEV *getAddressAccessSCEV( 5801 Value *Ptr, 5802 LoopVectorizationLegality *Legal, 5803 PredicatedScalarEvolution &PSE, 5804 const Loop *TheLoop) { 5805 5806 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5807 if (!Gep) 5808 return nullptr; 5809 5810 // We are looking for a gep with all loop invariant indices except for one 5811 // which should be an induction variable. 5812 auto SE = PSE.getSE(); 5813 unsigned NumOperands = Gep->getNumOperands(); 5814 for (unsigned i = 1; i < NumOperands; ++i) { 5815 Value *Opd = Gep->getOperand(i); 5816 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5817 !Legal->isInductionVariable(Opd)) 5818 return nullptr; 5819 } 5820 5821 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 5822 return PSE.getSCEV(Ptr); 5823 } 5824 5825 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 5826 return Legal->hasStride(I->getOperand(0)) || 5827 Legal->hasStride(I->getOperand(1)); 5828 } 5829 5830 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5831 unsigned VF) { 5832 assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); 5833 Type *ValTy = getMemInstValueType(I); 5834 auto SE = PSE.getSE(); 5835 5836 unsigned AS = getLoadStoreAddressSpace(I); 5837 Value *Ptr = getLoadStorePointerOperand(I); 5838 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5839 5840 // Figure out whether the access is strided and get the stride value 5841 // if it's known in compile time 5842 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5843 5844 // Get the cost of the scalar memory instruction and address computation. 5845 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5846 5847 // Don't pass *I here, since it is scalar but will actually be part of a 5848 // vectorized loop where the user of it is a vectorized instruction. 
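  // Rough illustration (hypothetical numbers): for a predicated i32 load at
  // VF = 4, the cost below is 4 * (address computation + scalar load) plus the
  // insert/extract overhead from getScalarizationOverhead, with the total then
  // divided by the reciprocal block probability since the predicated block may
  // not execute for every lane.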
5849 const Align Alignment = getLoadStoreAlignment(I); 5850 Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 5851 Alignment, AS, 5852 TTI::TCK_RecipThroughput); 5853 5854 // Get the overhead of the extractelement and insertelement instructions 5855 // we might create due to scalarization. 5856 Cost += getScalarizationOverhead(I, VF); 5857 5858 // If we have a predicated store, it may not be executed for each vector 5859 // lane. Scale the cost by the probability of executing the predicated 5860 // block. 5861 if (isPredicatedInst(I)) { 5862 Cost /= getReciprocalPredBlockProb(); 5863 5864 if (useEmulatedMaskMemRefHack(I)) 5865 // Artificially setting to a high enough value to practically disable 5866 // vectorization with such operations. 5867 Cost = 3000000; 5868 } 5869 5870 return Cost; 5871 } 5872 5873 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5874 unsigned VF) { 5875 Type *ValTy = getMemInstValueType(I); 5876 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5877 Value *Ptr = getLoadStorePointerOperand(I); 5878 unsigned AS = getLoadStoreAddressSpace(I); 5879 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 5880 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5881 5882 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5883 "Stride should be 1 or -1 for consecutive memory access"); 5884 const Align Alignment = getLoadStoreAlignment(I); 5885 unsigned Cost = 0; 5886 if (Legal->isMaskRequired(I)) 5887 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, 5888 Alignment.value(), AS, CostKind); 5889 else 5890 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 5891 CostKind, I); 5892 5893 bool Reverse = ConsecutiveStride < 0; 5894 if (Reverse) 5895 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5896 return Cost; 5897 } 5898 5899 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5900 unsigned VF) { 5901 Type *ValTy = getMemInstValueType(I); 5902 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5903 const Align Alignment = getLoadStoreAlignment(I); 5904 unsigned AS = getLoadStoreAddressSpace(I); 5905 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5906 if (isa<LoadInst>(I)) { 5907 return TTI.getAddressComputationCost(ValTy) + 5908 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 5909 CostKind) + 5910 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 5911 } 5912 StoreInst *SI = cast<StoreInst>(I); 5913 5914 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 5915 return TTI.getAddressComputationCost(ValTy) + 5916 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 5917 CostKind) + 5918 (isLoopInvariantStoreValue 5919 ? 
0 5920 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 5921 VF - 1)); 5922 } 5923 5924 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 5925 unsigned VF) { 5926 Type *ValTy = getMemInstValueType(I); 5927 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5928 const Align Alignment = getLoadStoreAlignment(I); 5929 Value *Ptr = getLoadStorePointerOperand(I); 5930 5931 return TTI.getAddressComputationCost(VectorTy) + 5932 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, 5933 Legal->isMaskRequired(I), Alignment.value(), 5934 TargetTransformInfo::TCK_RecipThroughput, 5935 I); 5936 } 5937 5938 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 5939 unsigned VF) { 5940 Type *ValTy = getMemInstValueType(I); 5941 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5942 unsigned AS = getLoadStoreAddressSpace(I); 5943 5944 auto Group = getInterleavedAccessGroup(I); 5945 assert(Group && "Fail to get an interleaved access group."); 5946 5947 unsigned InterleaveFactor = Group->getFactor(); 5948 VectorType *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 5949 5950 // Holds the indices of existing members in an interleaved load group. 5951 // An interleaved store group doesn't need this as it doesn't allow gaps. 5952 SmallVector<unsigned, 4> Indices; 5953 if (isa<LoadInst>(I)) { 5954 for (unsigned i = 0; i < InterleaveFactor; i++) 5955 if (Group->getMember(i)) 5956 Indices.push_back(i); 5957 } 5958 5959 // Calculate the cost of the whole interleaved group. 5960 bool UseMaskForGaps = 5961 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5962 unsigned Cost = TTI.getInterleavedMemoryOpCost( 5963 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5964 Group->getAlign().value(), AS, TTI::TCK_RecipThroughput, 5965 Legal->isMaskRequired(I), UseMaskForGaps); 5966 5967 if (Group->isReverse()) { 5968 // TODO: Add support for reversed masked interleaved access. 5969 assert(!Legal->isMaskRequired(I) && 5970 "Reverse masked interleaved access not supported."); 5971 Cost += Group->getNumMembers() * 5972 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5973 } 5974 return Cost; 5975 } 5976 5977 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 5978 unsigned VF) { 5979 // Calculate scalar cost only. Vectorization cost should be ready at this 5980 // moment. 5981 if (VF == 1) { 5982 Type *ValTy = getMemInstValueType(I); 5983 const Align Alignment = getLoadStoreAlignment(I); 5984 unsigned AS = getLoadStoreAddressSpace(I); 5985 5986 return TTI.getAddressComputationCost(ValTy) + 5987 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 5988 TTI::TCK_RecipThroughput, I); 5989 } 5990 return getWideningCost(I, VF); 5991 } 5992 5993 LoopVectorizationCostModel::VectorizationCostTy 5994 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 5995 // If we know that this instruction will remain uniform, check the cost of 5996 // the scalar version. 5997 if (isUniformAfterVectorization(I, VF)) 5998 VF = 1; 5999 6000 if (VF > 1 && isProfitableToScalarize(I, VF)) 6001 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6002 6003 // Forced scalars do not have any scalarization overhead. 
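  // e.g. (illustrative): an address computation forced to remain scalar at
  // VF = 4 is simply charged 4x its scalar cost, with no insert/extract
  // overhead added on top.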
6004 auto ForcedScalar = ForcedScalars.find(VF); 6005 if (VF > 1 && ForcedScalar != ForcedScalars.end()) { 6006 auto InstSet = ForcedScalar->second; 6007 if (InstSet.find(I) != InstSet.end()) 6008 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); 6009 } 6010 6011 Type *VectorTy; 6012 unsigned C = getInstructionCost(I, VF, VectorTy); 6013 6014 bool TypeNotScalarized = 6015 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; 6016 return VectorizationCostTy(C, TypeNotScalarized); 6017 } 6018 6019 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6020 unsigned VF) { 6021 6022 if (VF == 1) 6023 return 0; 6024 6025 unsigned Cost = 0; 6026 Type *RetTy = ToVectorTy(I->getType(), VF); 6027 if (!RetTy->isVoidTy() && 6028 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6029 Cost += TTI.getScalarizationOverhead( 6030 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF), true, false); 6031 6032 // Some targets keep addresses scalar. 6033 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6034 return Cost; 6035 6036 // Some targets support efficient element stores. 6037 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6038 return Cost; 6039 6040 // Collect operands to consider. 6041 CallInst *CI = dyn_cast<CallInst>(I); 6042 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6043 6044 // Skip operands that do not require extraction/scalarization and do not incur 6045 // any overhead. 6046 return Cost + TTI.getOperandsScalarizationOverhead( 6047 filterExtractingOperands(Ops, VF), VF); 6048 } 6049 6050 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { 6051 if (VF == 1) 6052 return; 6053 NumPredStores = 0; 6054 for (BasicBlock *BB : TheLoop->blocks()) { 6055 // For each instruction in the old loop. 6056 for (Instruction &I : *BB) { 6057 Value *Ptr = getLoadStorePointerOperand(&I); 6058 if (!Ptr) 6059 continue; 6060 6061 // TODO: We should generate better code and update the cost model for 6062 // predicated uniform stores. Today they are treated as any other 6063 // predicated store (see added test cases in 6064 // invariant-store-vectorization.ll). 6065 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6066 NumPredStores++; 6067 6068 if (Legal->isUniform(Ptr) && 6069 // Conditional loads and stores should be scalarized and predicated. 6070 // isScalarWithPredication cannot be used here since masked 6071 // gather/scatters are not considered scalar with predication. 6072 !Legal->blockNeedsPredication(I.getParent())) { 6073 // TODO: Avoid replicating loads and stores instead of 6074 // relying on instcombine to remove them. 6075 // Load: Scalar load + broadcast 6076 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6077 unsigned Cost = getUniformMemOpCost(&I, VF); 6078 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6079 continue; 6080 } 6081 6082 // We assume that widening is the best solution when possible. 6083 if (memoryInstructionCanBeWidened(&I, VF)) { 6084 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6085 int ConsecutiveStride = 6086 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6087 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6088 "Expected consecutive stride."); 6089 InstWidening Decision = 6090 ConsecutiveStride == 1 ? 
CM_Widen : CM_Widen_Reverse; 6091 setWideningDecision(&I, VF, Decision, Cost); 6092 continue; 6093 } 6094 6095 // Choose between Interleaving, Gather/Scatter or Scalarization. 6096 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6097 unsigned NumAccesses = 1; 6098 if (isAccessInterleaved(&I)) { 6099 auto Group = getInterleavedAccessGroup(&I); 6100 assert(Group && "Fail to get an interleaved access group."); 6101 6102 // Make one decision for the whole group. 6103 if (getWideningDecision(&I, VF) != CM_Unknown) 6104 continue; 6105 6106 NumAccesses = Group->getNumMembers(); 6107 if (interleavedAccessCanBeWidened(&I, VF)) 6108 InterleaveCost = getInterleaveGroupCost(&I, VF); 6109 } 6110 6111 unsigned GatherScatterCost = 6112 isLegalGatherOrScatter(&I) 6113 ? getGatherScatterCost(&I, VF) * NumAccesses 6114 : std::numeric_limits<unsigned>::max(); 6115 6116 unsigned ScalarizationCost = 6117 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6118 6119 // Choose better solution for the current VF, 6120 // write down this decision and use it during vectorization. 6121 unsigned Cost; 6122 InstWidening Decision; 6123 if (InterleaveCost <= GatherScatterCost && 6124 InterleaveCost < ScalarizationCost) { 6125 Decision = CM_Interleave; 6126 Cost = InterleaveCost; 6127 } else if (GatherScatterCost < ScalarizationCost) { 6128 Decision = CM_GatherScatter; 6129 Cost = GatherScatterCost; 6130 } else { 6131 Decision = CM_Scalarize; 6132 Cost = ScalarizationCost; 6133 } 6134 // If the instructions belongs to an interleave group, the whole group 6135 // receives the same decision. The whole group receives the cost, but 6136 // the cost will actually be assigned to one instruction. 6137 if (auto Group = getInterleavedAccessGroup(&I)) 6138 setWideningDecision(Group, VF, Decision, Cost); 6139 else 6140 setWideningDecision(&I, VF, Decision, Cost); 6141 } 6142 } 6143 6144 // Make sure that any load of address and any other address computation 6145 // remains scalar unless there is gather/scatter support. This avoids 6146 // inevitable extracts into address registers, and also has the benefit of 6147 // activating LSR more, since that pass can't optimize vectorized 6148 // addresses. 6149 if (TTI.prefersVectorizedAddressing()) 6150 return; 6151 6152 // Start with all scalar pointer uses. 6153 SmallPtrSet<Instruction *, 8> AddrDefs; 6154 for (BasicBlock *BB : TheLoop->blocks()) 6155 for (Instruction &I : *BB) { 6156 Instruction *PtrDef = 6157 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6158 if (PtrDef && TheLoop->contains(PtrDef) && 6159 getWideningDecision(&I, VF) != CM_GatherScatter) 6160 AddrDefs.insert(PtrDef); 6161 } 6162 6163 // Add all instructions used to generate the addresses. 6164 SmallVector<Instruction *, 4> Worklist; 6165 for (auto *I : AddrDefs) 6166 Worklist.push_back(I); 6167 while (!Worklist.empty()) { 6168 Instruction *I = Worklist.pop_back_val(); 6169 for (auto &Op : I->operands()) 6170 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6171 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6172 AddrDefs.insert(InstOp).second) 6173 Worklist.push_back(InstOp); 6174 } 6175 6176 for (auto *I : AddrDefs) { 6177 if (isa<LoadInst>(I)) { 6178 // Setting the desired widening decision should ideally be handled in 6179 // by cost functions, but since this involves the task of finding out 6180 // if the loaded register is involved in an address computation, it is 6181 // instead changed here when we know this is the case. 
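    // Illustrative IR (hypothetical):
    //   %p = load i32*, i32** %gep      ; the loaded value feeds an address
    //   %v = load i32, i32* %p
    // Without fast gather/scatter support, widening %p would require
    // extracting one lane per scalar address, so %p is switched to
    // CM_Scalarize below instead.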
6182 InstWidening Decision = getWideningDecision(I, VF); 6183 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6184 // Scalarize a widened load of address. 6185 setWideningDecision(I, VF, CM_Scalarize, 6186 (VF * getMemoryInstructionCost(I, 1))); 6187 else if (auto Group = getInterleavedAccessGroup(I)) { 6188 // Scalarize an interleave group of address loads. 6189 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6190 if (Instruction *Member = Group->getMember(I)) 6191 setWideningDecision(Member, VF, CM_Scalarize, 6192 (VF * getMemoryInstructionCost(Member, 1))); 6193 } 6194 } 6195 } else 6196 // Make sure I gets scalarized and a cost estimate without 6197 // scalarization overhead. 6198 ForcedScalars[VF].insert(I); 6199 } 6200 } 6201 6202 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6203 unsigned VF, 6204 Type *&VectorTy) { 6205 Type *RetTy = I->getType(); 6206 if (canTruncateToMinimalBitwidth(I, VF)) 6207 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6208 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6209 auto SE = PSE.getSE(); 6210 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6211 6212 // TODO: We need to estimate the cost of intrinsic calls. 6213 switch (I->getOpcode()) { 6214 case Instruction::GetElementPtr: 6215 // We mark this instruction as zero-cost because the cost of GEPs in 6216 // vectorized code depends on whether the corresponding memory instruction 6217 // is scalarized or not. Therefore, we handle GEPs with the memory 6218 // instruction cost. 6219 return 0; 6220 case Instruction::Br: { 6221 // In cases of scalarized and predicated instructions, there will be VF 6222 // predicated blocks in the vectorized loop. Each branch around these 6223 // blocks requires also an extract of its vector compare i1 element. 6224 bool ScalarPredicatedBB = false; 6225 BranchInst *BI = cast<BranchInst>(I); 6226 if (VF > 1 && BI->isConditional() && 6227 (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) != 6228 PredicatedBBsAfterVectorization.end() || 6229 PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) != 6230 PredicatedBBsAfterVectorization.end())) 6231 ScalarPredicatedBB = true; 6232 6233 if (ScalarPredicatedBB) { 6234 // Return cost for branches around scalarized and predicated blocks. 6235 VectorType *Vec_i1Ty = 6236 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6237 return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF), 6238 false, true) + 6239 (TTI.getCFInstrCost(Instruction::Br) * VF)); 6240 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) 6241 // The back-edge branch will remain, as will all scalar branches. 6242 return TTI.getCFInstrCost(Instruction::Br); 6243 else 6244 // This branch will be eliminated by if-conversion. 6245 return 0; 6246 // Note: We currently assume zero cost for an unconditional branch inside 6247 // a predicated block since it will become a fall-through, although we 6248 // may decide in the future to call TTI for all branches. 6249 } 6250 case Instruction::PHI: { 6251 auto *Phi = cast<PHINode>(I); 6252 6253 // First-order recurrences are replaced by vector shuffles inside the loop. 6254 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 
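    // Illustration (added): for a recurrence
    //   %x = phi i32 [ %init, %ph ], [ %y, %latch ]
    // the vector loop combines the previous iteration's vector of %y with the
    // current one via a shuffle; its cost is approximated below as extracting
    // the last lane (a length-1 subvector at index VF - 1).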
6255 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) 6256 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 6257 cast<VectorType>(VectorTy), VF - 1, 6258 VectorType::get(RetTy, 1)); 6259 6260 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6261 // converted into select instructions. We require N - 1 selects per phi 6262 // node, where N is the number of incoming values. 6263 if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) 6264 return (Phi->getNumIncomingValues() - 1) * 6265 TTI.getCmpSelInstrCost( 6266 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6267 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6268 CostKind); 6269 6270 return TTI.getCFInstrCost(Instruction::PHI); 6271 } 6272 case Instruction::UDiv: 6273 case Instruction::SDiv: 6274 case Instruction::URem: 6275 case Instruction::SRem: 6276 // If we have a predicated instruction, it may not be executed for each 6277 // vector lane. Get the scalarization cost and scale this amount by the 6278 // probability of executing the predicated block. If the instruction is not 6279 // predicated, we fall through to the next case. 6280 if (VF > 1 && isScalarWithPredication(I)) { 6281 unsigned Cost = 0; 6282 6283 // These instructions have a non-void type, so account for the phi nodes 6284 // that we will create. This cost is likely to be zero. The phi node 6285 // cost, if any, should be scaled by the block probability because it 6286 // models a copy at the end of each predicated block. 6287 Cost += VF * TTI.getCFInstrCost(Instruction::PHI); 6288 6289 // The cost of the non-predicated instruction. 6290 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 6291 6292 // The cost of insertelement and extractelement instructions needed for 6293 // scalarization. 6294 Cost += getScalarizationOverhead(I, VF); 6295 6296 // Scale the cost by the probability of executing the predicated blocks. 6297 // This assumes the predicated block for each vector lane is equally 6298 // likely. 6299 return Cost / getReciprocalPredBlockProb(); 6300 } 6301 LLVM_FALLTHROUGH; 6302 case Instruction::Add: 6303 case Instruction::FAdd: 6304 case Instruction::Sub: 6305 case Instruction::FSub: 6306 case Instruction::Mul: 6307 case Instruction::FMul: 6308 case Instruction::FDiv: 6309 case Instruction::FRem: 6310 case Instruction::Shl: 6311 case Instruction::LShr: 6312 case Instruction::AShr: 6313 case Instruction::And: 6314 case Instruction::Or: 6315 case Instruction::Xor: { 6316 // Since we will replace the stride by 1 the multiplication should go away. 6317 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6318 return 0; 6319 // Certain instructions can be cheaper to vectorize if they have a constant 6320 // second vector operand. One example of this are shifts on x86. 6321 Value *Op2 = I->getOperand(1); 6322 TargetTransformInfo::OperandValueProperties Op2VP; 6323 TargetTransformInfo::OperandValueKind Op2VK = 6324 TTI.getOperandInfo(Op2, Op2VP); 6325 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6326 Op2VK = TargetTransformInfo::OK_UniformValue; 6327 6328 SmallVector<const Value *, 4> Operands(I->operand_values()); 6329 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6330 return N * TTI.getArithmeticInstrCost( 6331 I->getOpcode(), VectorTy, CostKind, 6332 TargetTransformInfo::OK_AnyValue, 6333 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6334 } 6335 case Instruction::FNeg: { 6336 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6337 return N * TTI.getArithmeticInstrCost( 6338 I->getOpcode(), VectorTy, CostKind, 6339 TargetTransformInfo::OK_AnyValue, 6340 TargetTransformInfo::OK_AnyValue, 6341 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6342 I->getOperand(0), I); 6343 } 6344 case Instruction::Select: { 6345 SelectInst *SI = cast<SelectInst>(I); 6346 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6347 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6348 Type *CondTy = SI->getCondition()->getType(); 6349 if (!ScalarCond) 6350 CondTy = VectorType::get(CondTy, VF); 6351 6352 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 6353 CostKind, I); 6354 } 6355 case Instruction::ICmp: 6356 case Instruction::FCmp: { 6357 Type *ValTy = I->getOperand(0)->getType(); 6358 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6359 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6360 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6361 VectorTy = ToVectorTy(ValTy, VF); 6362 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind, 6363 I); 6364 } 6365 case Instruction::Store: 6366 case Instruction::Load: { 6367 unsigned Width = VF; 6368 if (Width > 1) { 6369 InstWidening Decision = getWideningDecision(I, Width); 6370 assert(Decision != CM_Unknown && 6371 "CM decision should be taken at this point"); 6372 if (Decision == CM_Scalarize) 6373 Width = 1; 6374 } 6375 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6376 return getMemoryInstructionCost(I, VF); 6377 } 6378 case Instruction::ZExt: 6379 case Instruction::SExt: 6380 case Instruction::FPToUI: 6381 case Instruction::FPToSI: 6382 case Instruction::FPExt: 6383 case Instruction::PtrToInt: 6384 case Instruction::IntToPtr: 6385 case Instruction::SIToFP: 6386 case Instruction::UIToFP: 6387 case Instruction::Trunc: 6388 case Instruction::FPTrunc: 6389 case Instruction::BitCast: { 6390 // We optimize the truncation of induction variables having constant 6391 // integer steps. The cost of these truncations is the same as the scalar 6392 // operation. 6393 if (isOptimizableIVTruncate(I, VF)) { 6394 auto *Trunc = cast<TruncInst>(I); 6395 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6396 Trunc->getSrcTy(), CostKind, Trunc); 6397 } 6398 6399 Type *SrcScalarTy = I->getOperand(0)->getType(); 6400 Type *SrcVecTy = 6401 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6402 if (canTruncateToMinimalBitwidth(I, VF)) { 6403 // This cast is going to be shrunk. This may remove the cast or it might 6404 // turn it into slightly different cast. For example, if MinBW == 16, 6405 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6406 // 6407 // Calculate the modified src and dest types. 6408 Type *MinVecTy = VectorTy; 6409 if (I->getOpcode() == Instruction::Trunc) { 6410 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6411 VectorTy = 6412 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6413 } else if (I->getOpcode() == Instruction::ZExt || 6414 I->getOpcode() == Instruction::SExt) { 6415 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6416 VectorTy = 6417 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6418 } 6419 } 6420 6421 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6422 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, 6423 CostKind, I); 6424 } 6425 case Instruction::Call: { 6426 bool NeedToScalarize; 6427 CallInst *CI = cast<CallInst>(I); 6428 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6429 if (getVectorIntrinsicIDForCall(CI, TLI)) 6430 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6431 return CallCost; 6432 } 6433 default: 6434 // The cost of executing VF copies of the scalar instruction. This opcode 6435 // is unknown. Assume that it is the same as 'mul'. 6436 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, 6437 CostKind) + 6438 getScalarizationOverhead(I, VF); 6439 } // end of switch. 6440 } 6441 6442 char LoopVectorize::ID = 0; 6443 6444 static const char lv_name[] = "Loop Vectorization"; 6445 6446 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6447 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6448 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6449 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6450 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6451 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6452 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6453 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6454 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6455 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6456 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6457 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6458 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6459 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6460 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6461 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6462 6463 namespace llvm { 6464 6465 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6466 6467 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6468 bool VectorizeOnlyWhenForced) { 6469 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6470 } 6471 6472 } // end namespace llvm 6473 6474 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6475 // Check if the pointer operand of a load or store instruction is 6476 // consecutive. 6477 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6478 return Legal->isConsecutivePtr(Ptr); 6479 return false; 6480 } 6481 6482 void LoopVectorizationCostModel::collectValuesToIgnore() { 6483 // Ignore ephemeral values. 6484 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6485 6486 // Ignore type-promoting instructions we identified during reduction 6487 // detection. 6488 for (auto &Reduction : Legal->getReductionVars()) { 6489 RecurrenceDescriptor &RedDes = Reduction.second; 6490 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6491 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6492 } 6493 // Ignore type-casting instructions we identified during induction 6494 // detection. 6495 for (auto &Induction : Legal->getInductionVars()) { 6496 InductionDescriptor &IndDes = Induction.second; 6497 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6498 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6499 } 6500 } 6501 6502 // TODO: we could return a pair of values that specify the max VF and 6503 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6504 // `buildVPlans(VF, VF)`. 
We cannot do it because VPLAN at the moment 6505 // doesn't have a cost model that can choose which plan to execute if 6506 // more than one is generated. 6507 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 6508 LoopVectorizationCostModel &CM) { 6509 unsigned WidestType; 6510 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 6511 return WidestVectorRegBits / WidestType; 6512 } 6513 6514 VectorizationFactor 6515 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { 6516 unsigned VF = UserVF; 6517 // Outer loop handling: They may require CFG and instruction level 6518 // transformations before even evaluating whether vectorization is profitable. 6519 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 6520 // the vectorization pipeline. 6521 if (!OrigLoop->empty()) { 6522 // If the user doesn't provide a vectorization factor, determine a 6523 // reasonable one. 6524 if (!UserVF) { 6525 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM); 6526 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 6527 6528 // Make sure we have a VF > 1 for stress testing. 6529 if (VPlanBuildStressTest && VF < 2) { 6530 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 6531 << "overriding computed VF.\n"); 6532 VF = 4; 6533 } 6534 } 6535 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 6536 assert(isPowerOf2_32(VF) && "VF needs to be a power of two"); 6537 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF 6538 << " to build VPlans.\n"); 6539 buildVPlans(VF, VF); 6540 6541 // For VPlan build stress testing, we bail out after VPlan construction. 6542 if (VPlanBuildStressTest) 6543 return VectorizationFactor::Disabled(); 6544 6545 return {VF, 0}; 6546 } 6547 6548 LLVM_DEBUG( 6549 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 6550 "VPlan-native path.\n"); 6551 return VectorizationFactor::Disabled(); 6552 } 6553 6554 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF, 6555 unsigned UserIC) { 6556 assert(OrigLoop->empty() && "Inner loop expected."); 6557 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); 6558 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 6559 return None; 6560 6561 // Invalidate interleave groups if all blocks of loop will be predicated. 6562 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 6563 !useMaskedInterleavedAccesses(*TTI)) { 6564 LLVM_DEBUG( 6565 dbgs() 6566 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 6567 "which requires masked-interleaved support.\n"); 6568 if (CM.InterleaveInfo.invalidateGroups()) 6569 // Invalidating interleave groups also requires invalidating all decisions 6570 // based on them, which includes widening decisions and uniform and scalar 6571 // values. 6572 CM.invalidateCostModelingDecisions(); 6573 } 6574 6575 if (UserVF) { 6576 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 6577 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); 6578 // Collect the instructions (and their associated costs) that will be more 6579 // profitable to scalarize. 
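    // e.g. (illustrative): with -force-vector-width=4 the VF search is skipped
    // entirely and a single VPlan covering only VF = 4 is built below.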
6580 CM.selectUserVectorizationFactor(UserVF); 6581 buildVPlansWithVPRecipes(UserVF, UserVF); 6582 LLVM_DEBUG(printPlans(dbgs())); 6583 return {{UserVF, 0}}; 6584 } 6585 6586 unsigned MaxVF = MaybeMaxVF.getValue(); 6587 assert(MaxVF != 0 && "MaxVF is zero."); 6588 6589 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 6590 // Collect Uniform and Scalar instructions after vectorization with VF. 6591 CM.collectUniformsAndScalars(VF); 6592 6593 // Collect the instructions (and their associated costs) that will be more 6594 // profitable to scalarize. 6595 if (VF > 1) 6596 CM.collectInstsToScalarize(VF); 6597 } 6598 6599 buildVPlansWithVPRecipes(1, MaxVF); 6600 LLVM_DEBUG(printPlans(dbgs())); 6601 if (MaxVF == 1) 6602 return VectorizationFactor::Disabled(); 6603 6604 // Select the optimal vectorization factor. 6605 return CM.selectVectorizationFactor(MaxVF); 6606 } 6607 6608 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { 6609 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 6610 << '\n'); 6611 BestVF = VF; 6612 BestUF = UF; 6613 6614 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 6615 return !Plan->hasVF(VF); 6616 }); 6617 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 6618 } 6619 6620 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 6621 DominatorTree *DT) { 6622 // Perform the actual loop transformation. 6623 6624 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 6625 VPCallbackILV CallbackILV(ILV); 6626 6627 VPTransformState State{BestVF, BestUF, LI, 6628 DT, ILV.Builder, ILV.VectorLoopValueMap, 6629 &ILV, CallbackILV}; 6630 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 6631 State.TripCount = ILV.getOrCreateTripCount(nullptr); 6632 State.CanonicalIV = ILV.Induction; 6633 6634 //===------------------------------------------------===// 6635 // 6636 // Notice: any optimization or new instruction that go 6637 // into the code below should also be implemented in 6638 // the cost-model. 6639 // 6640 //===------------------------------------------------===// 6641 6642 // 2. Copy and widen instructions from the old loop into the new loop. 6643 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 6644 VPlans.front()->execute(&State); 6645 6646 // 3. Fix the vectorized code: take care of header phi's, live-outs, 6647 // predication, updating analyses. 6648 ILV.fixVectorizedLoop(); 6649 } 6650 6651 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 6652 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6653 BasicBlock *Latch = OrigLoop->getLoopLatch(); 6654 6655 // We create new control-flow for the vectorized loop, so the original 6656 // condition will be dead after vectorization if it's only used by the 6657 // branch. 6658 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 6659 if (Cmp && Cmp->hasOneUse()) 6660 DeadInstructions.insert(Cmp); 6661 6662 // We create new "steps" for induction variable updates to which the original 6663 // induction variables map. An original update instruction will be dead if 6664 // all its users except the induction variable are dead. 
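  // Illustrative IR (hypothetical): %iv.next = add nuw nsw i64 %iv, 1 becomes
  // dead when its only users are the %iv phi itself and the latch compare
  // already recorded as dead above.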
6665 for (auto &Induction : Legal->getInductionVars()) { 6666 PHINode *Ind = Induction.first; 6667 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 6668 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 6669 return U == Ind || DeadInstructions.find(cast<Instruction>(U)) != 6670 DeadInstructions.end(); 6671 })) 6672 DeadInstructions.insert(IndUpdate); 6673 6674 // We record as "Dead" also the type-casting instructions we had identified 6675 // during induction analysis. We don't need any handling for them in the 6676 // vectorized loop because we have proven that, under a proper runtime 6677 // test guarding the vectorized loop, the value of the phi, and the casted 6678 // value of the phi, are the same. The last instruction in this casting chain 6679 // will get its scalar/vector/widened def from the scalar/vector/widened def 6680 // of the respective phi node. Any other casts in the induction def-use chain 6681 // have no other uses outside the phi update chain, and will be ignored. 6682 InductionDescriptor &IndDes = Induction.second; 6683 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6684 DeadInstructions.insert(Casts.begin(), Casts.end()); 6685 } 6686 } 6687 6688 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6689 6690 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6691 6692 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6693 Instruction::BinaryOps BinOp) { 6694 // When unrolling and the VF is 1, we only need to add a simple scalar. 6695 Type *Ty = Val->getType(); 6696 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6697 6698 if (Ty->isFloatingPointTy()) { 6699 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6700 6701 // Floating point operations had to be 'fast' to enable the unrolling. 6702 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6703 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6704 } 6705 Constant *C = ConstantInt::get(Ty, StartIdx); 6706 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6707 } 6708 6709 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6710 SmallVector<Metadata *, 4> MDs; 6711 // Reserve first location for self reference to the LoopID metadata node. 6712 MDs.push_back(nullptr); 6713 bool IsUnrollMetadata = false; 6714 MDNode *LoopID = L->getLoopID(); 6715 if (LoopID) { 6716 // First find existing loop unrolling disable metadata. 6717 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6718 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6719 if (MD) { 6720 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6721 IsUnrollMetadata = 6722 S && S->getString().startswith("llvm.loop.unroll.disable"); 6723 } 6724 MDs.push_back(LoopID->getOperand(i)); 6725 } 6726 } 6727 6728 if (!IsUnrollMetadata) { 6729 // Add runtime unroll disable metadata. 6730 LLVMContext &Context = L->getHeader()->getContext(); 6731 SmallVector<Metadata *, 1> DisableOperands; 6732 DisableOperands.push_back( 6733 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6734 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6735 MDs.push_back(DisableNode); 6736 MDNode *NewLoopID = MDNode::get(Context, MDs); 6737 // Set operand 0 to refer to the loop id itself. 
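    // The result is self-referential loop metadata of the form (illustrative):
    //   br ... !llvm.loop !0
    //   !0 = distinct !{!0, ..., !1}
    //   !1 = !{!"llvm.loop.unroll.runtime.disable"}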
6738 NewLoopID->replaceOperandWith(0, NewLoopID); 6739 L->setLoopID(NewLoopID); 6740 } 6741 } 6742 6743 bool LoopVectorizationPlanner::getDecisionAndClampRange( 6744 const std::function<bool(unsigned)> &Predicate, VFRange &Range) { 6745 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 6746 bool PredicateAtRangeStart = Predicate(Range.Start); 6747 6748 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 6749 if (Predicate(TmpVF) != PredicateAtRangeStart) { 6750 Range.End = TmpVF; 6751 break; 6752 } 6753 6754 return PredicateAtRangeStart; 6755 } 6756 6757 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 6758 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 6759 /// of VF's starting at a given VF and extending it as much as possible. Each 6760 /// vectorization decision can potentially shorten this sub-range during 6761 /// buildVPlan(). 6762 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 6763 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6764 VFRange SubRange = {VF, MaxVF + 1}; 6765 VPlans.push_back(buildVPlan(SubRange)); 6766 VF = SubRange.End; 6767 } 6768 } 6769 6770 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 6771 VPlanPtr &Plan) { 6772 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 6773 6774 // Look for cached value. 6775 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 6776 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 6777 if (ECEntryIt != EdgeMaskCache.end()) 6778 return ECEntryIt->second; 6779 6780 VPValue *SrcMask = createBlockInMask(Src, Plan); 6781 6782 // The terminator has to be a branch inst! 6783 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 6784 assert(BI && "Unexpected terminator found"); 6785 6786 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 6787 return EdgeMaskCache[Edge] = SrcMask; 6788 6789 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 6790 assert(EdgeMask && "No Edge Mask found for condition"); 6791 6792 if (BI->getSuccessor(0) != Dst) 6793 EdgeMask = Builder.createNot(EdgeMask); 6794 6795 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 6796 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 6797 6798 return EdgeMaskCache[Edge] = EdgeMask; 6799 } 6800 6801 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 6802 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 6803 6804 // Look for cached value. 6805 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 6806 if (BCEntryIt != BlockMaskCache.end()) 6807 return BCEntryIt->second; 6808 6809 // All-one mask is modelled as no-mask following the convention for masked 6810 // load/store/gather/scatter. Initialize BlockMask to no-mask. 6811 VPValue *BlockMask = nullptr; 6812 6813 if (OrigLoop->getHeader() == BB) { 6814 if (!CM.blockNeedsPredication(BB)) 6815 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 6816 6817 // Introduce the early-exit compare IV <= BTC to form header block mask. 6818 // This is used instead of IV < TC because TC may wrap, unlike BTC. 6819 // Start by constructing the desired canonical IV. 
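    // Worked example (illustrative): with trip count TC = 10 and VF = 4,
    // BTC = 9; in the final vector iteration the lanes hold IV = {8, 9, 10, 11},
    // so the mask ICmpULE(IV, BTC) = {1, 1, 0, 0} disables the two lanes that
    // would run past the scalar trip count.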
6820 VPValue *IV = nullptr; 6821 if (Legal->getPrimaryInduction()) 6822 IV = Plan->getVPValue(Legal->getPrimaryInduction()); 6823 else { 6824 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 6825 Builder.getInsertBlock()->appendRecipe(IVRecipe); 6826 IV = IVRecipe->getVPValue(); 6827 } 6828 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 6829 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 6830 return BlockMaskCache[BB] = BlockMask; 6831 } 6832 6833 // This is the block mask. We OR all incoming edges. 6834 for (auto *Predecessor : predecessors(BB)) { 6835 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 6836 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 6837 return BlockMaskCache[BB] = EdgeMask; 6838 6839 if (!BlockMask) { // BlockMask has its initialized nullptr value. 6840 BlockMask = EdgeMask; 6841 continue; 6842 } 6843 6844 BlockMask = Builder.createOr(BlockMask, EdgeMask); 6845 } 6846 6847 return BlockMaskCache[BB] = BlockMask; 6848 } 6849 6850 VPWidenMemoryInstructionRecipe * 6851 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 6852 VPlanPtr &Plan) { 6853 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 6854 "Must be called with either a load or store"); 6855 6856 auto willWiden = [&](unsigned VF) -> bool { 6857 if (VF == 1) 6858 return false; 6859 LoopVectorizationCostModel::InstWidening Decision = 6860 CM.getWideningDecision(I, VF); 6861 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 6862 "CM decision should be taken at this point."); 6863 if (Decision == LoopVectorizationCostModel::CM_Interleave) 6864 return true; 6865 if (CM.isScalarAfterVectorization(I, VF) || 6866 CM.isProfitableToScalarize(I, VF)) 6867 return false; 6868 return Decision != LoopVectorizationCostModel::CM_Scalarize; 6869 }; 6870 6871 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6872 return nullptr; 6873 6874 VPValue *Mask = nullptr; 6875 if (Legal->isMaskRequired(I)) 6876 Mask = createBlockInMask(I->getParent(), Plan); 6877 6878 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 6879 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 6880 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 6881 6882 StoreInst *Store = cast<StoreInst>(I); 6883 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 6884 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 6885 } 6886 6887 VPWidenIntOrFpInductionRecipe * 6888 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { 6889 // Check if this is an integer or fp induction. If so, build the recipe that 6890 // produces its scalar and vector values. 6891 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 6892 if (II.getKind() == InductionDescriptor::IK_IntInduction || 6893 II.getKind() == InductionDescriptor::IK_FpInduction) 6894 return new VPWidenIntOrFpInductionRecipe(Phi); 6895 6896 return nullptr; 6897 } 6898 6899 VPWidenIntOrFpInductionRecipe * 6900 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, 6901 VFRange &Range) const { 6902 // Optimize the special case where the source is a constant integer 6903 // induction variable. Notice that we can only optimize the 'trunc' case 6904 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 6905 // (c) other casts depend on pointer size. 6906 6907 // Determine whether \p K is a truncation based on an induction variable that 6908 // can be optimized. 
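  // e.g. (illustrative): %t = trunc i64 %iv to i32, where %iv is an integer
  // induction with a constant step, can be widened directly as an i32
  // induction instead of widening %iv and truncating every lane.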
6909 auto isOptimizableIVTruncate = 6910 [&](Instruction *K) -> std::function<bool(unsigned)> { 6911 return 6912 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; 6913 }; 6914 6915 if (LoopVectorizationPlanner::getDecisionAndClampRange( 6916 isOptimizableIVTruncate(I), Range)) 6917 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 6918 I); 6919 return nullptr; 6920 } 6921 6922 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 6923 // We know that all PHIs in non-header blocks are converted into selects, so 6924 // we don't have to worry about the insertion order and we can just use the 6925 // builder. At this point we generate the predication tree. There may be 6926 // duplications since this is a simple recursive scan, but future 6927 // optimizations will clean it up. 6928 6929 SmallVector<VPValue *, 2> Operands; 6930 unsigned NumIncoming = Phi->getNumIncomingValues(); 6931 for (unsigned In = 0; In < NumIncoming; In++) { 6932 VPValue *EdgeMask = 6933 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 6934 assert((EdgeMask || NumIncoming == 1) && 6935 "Multiple predecessors with one having a full mask"); 6936 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 6937 if (EdgeMask) 6938 Operands.push_back(EdgeMask); 6939 } 6940 return new VPBlendRecipe(Phi, Operands); 6941 } 6942 6943 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 6944 VPlan &Plan) const { 6945 6946 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6947 [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); }, 6948 Range); 6949 6950 if (IsPredicated) 6951 return nullptr; 6952 6953 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6954 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 6955 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 6956 return nullptr; 6957 6958 auto willWiden = [&](unsigned VF) -> bool { 6959 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6960 // The following case may be scalarized depending on the VF. 6961 // The flag shows whether we use Intrinsic or a usual Call for vectorized 6962 // version of the instruction. 6963 // Is it beneficial to perform intrinsic call compared to lib call? 6964 bool NeedToScalarize = false; 6965 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 6966 bool UseVectorIntrinsic = 6967 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 6968 return UseVectorIntrinsic || !NeedToScalarize; 6969 }; 6970 6971 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6972 return nullptr; 6973 6974 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 6975 } 6976 6977 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 6978 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 6979 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 6980 // Instruction should be widened, unless it is scalar after vectorization, 6981 // scalarization is profitable or it is predicated. 
6982 auto WillScalarize = [this, I](unsigned VF) -> bool { 6983 return CM.isScalarAfterVectorization(I, VF) || 6984 CM.isProfitableToScalarize(I, VF) || 6985 CM.isScalarWithPredication(I, VF); 6986 }; 6987 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 6988 Range); 6989 } 6990 6991 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 6992 auto IsVectorizableOpcode = [](unsigned Opcode) { 6993 switch (Opcode) { 6994 case Instruction::Add: 6995 case Instruction::And: 6996 case Instruction::AShr: 6997 case Instruction::BitCast: 6998 case Instruction::FAdd: 6999 case Instruction::FCmp: 7000 case Instruction::FDiv: 7001 case Instruction::FMul: 7002 case Instruction::FNeg: 7003 case Instruction::FPExt: 7004 case Instruction::FPToSI: 7005 case Instruction::FPToUI: 7006 case Instruction::FPTrunc: 7007 case Instruction::FRem: 7008 case Instruction::FSub: 7009 case Instruction::ICmp: 7010 case Instruction::IntToPtr: 7011 case Instruction::LShr: 7012 case Instruction::Mul: 7013 case Instruction::Or: 7014 case Instruction::PtrToInt: 7015 case Instruction::SDiv: 7016 case Instruction::Select: 7017 case Instruction::SExt: 7018 case Instruction::Shl: 7019 case Instruction::SIToFP: 7020 case Instruction::SRem: 7021 case Instruction::Sub: 7022 case Instruction::Trunc: 7023 case Instruction::UDiv: 7024 case Instruction::UIToFP: 7025 case Instruction::URem: 7026 case Instruction::Xor: 7027 case Instruction::ZExt: 7028 return true; 7029 } 7030 return false; 7031 }; 7032 7033 if (!IsVectorizableOpcode(I->getOpcode())) 7034 return nullptr; 7035 7036 // Success: widen this instruction. 7037 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 7038 } 7039 7040 VPBasicBlock *VPRecipeBuilder::handleReplication( 7041 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 7042 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 7043 VPlanPtr &Plan) { 7044 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 7045 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, 7046 Range); 7047 7048 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7049 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 7050 7051 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 7052 IsUniform, IsPredicated); 7053 setRecipe(I, Recipe); 7054 7055 // Find if I uses a predicated instruction. If so, it will use its scalar 7056 // value. Avoid hoisting the insert-element which packs the scalar value into 7057 // a vector value, as that happens iff all users use the vector value. 7058 for (auto &Op : I->operands()) 7059 if (auto *PredInst = dyn_cast<Instruction>(Op)) 7060 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 7061 PredInst2Recipe[PredInst]->setAlsoPack(false); 7062 7063 // Finalize the recipe for Instr, first if it is not predicated. 7064 if (!IsPredicated) { 7065 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 7066 VPBB->appendRecipe(Recipe); 7067 return VPBB; 7068 } 7069 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7070 assert(VPBB->getSuccessors().empty() && 7071 "VPBB has successors when handling predicated replication."); 7072 // Record predicated instructions for above packing optimizations. 
7073 PredInst2Recipe[I] = Recipe; 7074 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7075 VPBlockUtils::insertBlockAfter(Region, VPBB); 7076 auto *RegSucc = new VPBasicBlock(); 7077 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7078 return RegSucc; 7079 } 7080 7081 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7082 VPRecipeBase *PredRecipe, 7083 VPlanPtr &Plan) { 7084 // Instructions marked for predication are replicated and placed under an 7085 // if-then construct to prevent side-effects. 7086 7087 // Generate recipes to compute the block mask for this region. 7088 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 7089 7090 // Build the triangular if-then region. 7091 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 7092 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 7093 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 7094 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 7095 auto *PHIRecipe = 7096 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 7097 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 7098 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 7099 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 7100 7101 // Note: first set Entry as region entry and then connect successors starting 7102 // from it in order, to propagate the "parent" of each VPBasicBlock. 7103 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 7104 VPBlockUtils::connectBlocks(Pred, Exit); 7105 7106 return Region; 7107 } 7108 7109 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 7110 VFRange &Range, 7111 VPlanPtr &Plan) { 7112 // First, check for specific widening recipes that deal with calls, memory 7113 // operations, inductions and Phi nodes. 7114 if (auto *CI = dyn_cast<CallInst>(Instr)) 7115 return tryToWidenCall(CI, Range, *Plan); 7116 7117 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 7118 return tryToWidenMemory(Instr, Range, Plan); 7119 7120 VPRecipeBase *Recipe; 7121 if (auto Phi = dyn_cast<PHINode>(Instr)) { 7122 if (Phi->getParent() != OrigLoop->getHeader()) 7123 return tryToBlend(Phi, Plan); 7124 if ((Recipe = tryToOptimizeInductionPHI(Phi))) 7125 return Recipe; 7126 return new VPWidenPHIRecipe(Phi); 7127 } 7128 7129 if (isa<TruncInst>(Instr) && 7130 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range))) 7131 return Recipe; 7132 7133 if (!shouldWiden(Instr, Range)) 7134 return nullptr; 7135 7136 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 7137 return new VPWidenGEPRecipe(GEP, OrigLoop); 7138 7139 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 7140 bool InvariantCond = 7141 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 7142 return new VPWidenSelectRecipe(*SI, InvariantCond); 7143 } 7144 7145 return tryToWiden(Instr, *Plan); 7146 } 7147 7148 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 7149 unsigned MaxVF) { 7150 assert(OrigLoop->empty() && "Inner loop expected."); 7151 7152 // Collect conditions feeding internal conditional branches; they need to be 7153 // represented in VPlan for it to model masking. 
7154 SmallPtrSet<Value *, 1> NeedDef; 7155 7156 auto *Latch = OrigLoop->getLoopLatch(); 7157 for (BasicBlock *BB : OrigLoop->blocks()) { 7158 if (BB == Latch) 7159 continue; 7160 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); 7161 if (Branch && Branch->isConditional()) 7162 NeedDef.insert(Branch->getCondition()); 7163 } 7164 7165 // If the tail is to be folded by masking, the primary induction variable, if 7166 // it exists, needs to be represented in VPlan for it to model early-exit masking. 7167 // Also, both the Phi and the live-out instruction of each reduction are 7168 // required in order to introduce a select between them in VPlan. 7169 if (CM.foldTailByMasking()) { 7170 if (Legal->getPrimaryInduction()) 7171 NeedDef.insert(Legal->getPrimaryInduction()); 7172 for (auto &Reduction : Legal->getReductionVars()) { 7173 NeedDef.insert(Reduction.first); 7174 NeedDef.insert(Reduction.second.getLoopExitInstr()); 7175 } 7176 } 7177 7178 // Collect instructions from the original loop that will become trivially dead 7179 // in the vectorized loop. We don't need to vectorize these instructions. For 7180 // example, original induction update instructions can become dead because we 7181 // separately emit induction "steps" when generating code for the new loop. 7182 // Similarly, we create a new latch condition when setting up the structure 7183 // of the new loop, so the old one can become dead. 7184 SmallPtrSet<Instruction *, 4> DeadInstructions; 7185 collectTriviallyDeadInstructions(DeadInstructions); 7186 7187 // Add assume instructions we need to drop to DeadInstructions, to prevent 7188 // them from being added to the VPlan. 7189 // TODO: We only need to drop assumes in blocks that get flattened. If the 7190 // control flow is preserved, we should keep them. 7191 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 7192 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 7193 7194 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 7195 // Dead instructions do not need sinking. Remove them from SinkAfter. 7196 for (Instruction *I : DeadInstructions) 7197 SinkAfter.erase(I); 7198 7199 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 7200 VFRange SubRange = {VF, MaxVF + 1}; 7201 VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef, 7202 DeadInstructions, SinkAfter)); 7203 VF = SubRange.End; 7204 } 7205 } 7206 7207 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 7208 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, 7209 SmallPtrSetImpl<Instruction *> &DeadInstructions, 7210 const DenseMap<Instruction *, Instruction *> &SinkAfter) { 7211 7212 // Hold a mapping from predicated instructions to their recipes, in order to 7213 // fix their AlsoPack behavior if a user is determined to replicate and use a 7214 // scalar instead of a vector value. 7215 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; 7216 7217 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 7218 7219 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 7220 7221 // --------------------------------------------------------------------------- 7222 // Pre-construction: record ingredients whose recipes we'll need to further 7223 // process after constructing the initial VPlan. 7224 // --------------------------------------------------------------------------- 7225 7226 // Mark instructions we'll need to sink later and their targets as 7227 // ingredients whose recipe we'll need to record.
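// (Both endpoints of each sink-after pair get their recipes recorded so that
// the "Apply Sink-After legal constraints" step below can look them up and
// move the sinking recipe after its target.)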
7228 for (auto &Entry : SinkAfter) { 7229 RecipeBuilder.recordRecipeOf(Entry.first); 7230 RecipeBuilder.recordRecipeOf(Entry.second); 7231 } 7232 7233 // For each interleave group which is relevant for this (possibly trimmed) 7234 // Range, add it to the set of groups to be later applied to the VPlan and add 7235 // placeholders for its members' Recipes which we'll be replacing with a 7236 // single VPInterleaveRecipe. 7237 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 7238 auto applyIG = [IG, this](unsigned VF) -> bool { 7239 return (VF >= 2 && // Query is illegal for VF == 1 7240 CM.getWideningDecision(IG->getInsertPos(), VF) == 7241 LoopVectorizationCostModel::CM_Interleave); 7242 }; 7243 if (!getDecisionAndClampRange(applyIG, Range)) 7244 continue; 7245 InterleaveGroups.insert(IG); 7246 for (unsigned i = 0; i < IG->getFactor(); i++) 7247 if (Instruction *Member = IG->getMember(i)) 7248 RecipeBuilder.recordRecipeOf(Member); 7249 } 7250 7251 // --------------------------------------------------------------------------- 7252 // Build initial VPlan: Scan the body of the loop in a topological order to 7253 // visit each basic block after having visited its predecessor basic blocks. 7254 // --------------------------------------------------------------------------- 7255 7256 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 7257 auto Plan = std::make_unique<VPlan>(); 7258 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 7259 Plan->setEntry(VPBB); 7260 7261 // Represent values that will have defs inside VPlan. 7262 for (Value *V : NeedDef) 7263 Plan->addVPValue(V); 7264 7265 // Scan the body of the loop in a topological order to visit each basic block 7266 // after having visited its predecessor basic blocks. 7267 LoopBlocksDFS DFS(OrigLoop); 7268 DFS.perform(LI); 7269 7270 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 7271 // Relevant instructions from basic block BB will be grouped into VPRecipe 7272 // ingredients and fill a new VPBasicBlock. 7273 unsigned VPBBsForBB = 0; 7274 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 7275 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 7276 VPBB = FirstVPBBForBB; 7277 Builder.setInsertPoint(VPBB); 7278 7279 // Introduce each ingredient into VPlan. 7280 // TODO: Model and preserve debug intrinsics in VPlan. 7281 for (Instruction &I : BB->instructionsWithoutDebug()) { 7282 Instruction *Instr = &I; 7283 7284 // First filter out irrelevant instructions, to ensure no recipes are 7285 // built for them. 7286 if (isa<BranchInst>(Instr) || 7287 DeadInstructions.find(Instr) != DeadInstructions.end()) 7288 continue; 7289 7290 if (auto Recipe = 7291 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { 7292 RecipeBuilder.setRecipe(Instr, Recipe); 7293 VPBB->appendRecipe(Recipe); 7294 continue; 7295 } 7296 7297 // Otherwise, if all widening options failed, the instruction is to be 7298 // replicated. This may create a successor for VPBB. 7299 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7300 Instr, Range, VPBB, PredInst2Recipe, Plan); 7301 if (NextVPBB != VPBB) { 7302 VPBB = NextVPBB; 7303 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 7304 : ""); 7305 } 7306 } 7307 } 7308 7309 // Discard the empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7310 // may also be empty, such as the current last one, VPBB, reflecting original 7311 // basic blocks with no recipes.
7312 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7313 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7314 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7315 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7316 delete PreEntry; 7317 7318 // --------------------------------------------------------------------------- 7319 // Transform initial VPlan: Apply previously taken decisions, in order, to 7320 // bring the VPlan to its final state. 7321 // --------------------------------------------------------------------------- 7322 7323 // Apply Sink-After legal constraints. 7324 for (auto &Entry : SinkAfter) { 7325 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7326 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7327 Sink->moveAfter(Target); 7328 } 7329 7330 // Interleave memory: for each Interleave Group we marked earlier as relevant 7331 // for this VPlan, replace the Recipes widening its memory instructions with a 7332 // single VPInterleaveRecipe at its insertion point. 7333 for (auto IG : InterleaveGroups) { 7334 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7335 RecipeBuilder.getRecipe(IG->getInsertPos())); 7336 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) 7337 ->insertBefore(Recipe); 7338 7339 for (unsigned i = 0; i < IG->getFactor(); ++i) 7340 if (Instruction *Member = IG->getMember(i)) { 7341 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7342 } 7343 } 7344 7345 // Finally, if tail is folded by masking, introduce selects between the phi 7346 // and the live-out instruction of each reduction, at the end of the latch. 7347 if (CM.foldTailByMasking()) { 7348 Builder.setInsertPoint(VPBB); 7349 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 7350 for (auto &Reduction : Legal->getReductionVars()) { 7351 VPValue *Phi = Plan->getVPValue(Reduction.first); 7352 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); 7353 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 7354 } 7355 } 7356 7357 std::string PlanName; 7358 raw_string_ostream RSO(PlanName); 7359 unsigned VF = Range.Start; 7360 Plan->addVF(VF); 7361 RSO << "Initial VPlan for VF={" << VF; 7362 for (VF *= 2; VF < Range.End; VF *= 2) { 7363 Plan->addVF(VF); 7364 RSO << "," << VF; 7365 } 7366 RSO << "},UF>=1"; 7367 RSO.flush(); 7368 Plan->setName(PlanName); 7369 7370 return Plan; 7371 } 7372 7373 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 7374 // Outer loop handling: They may require CFG and instruction level 7375 // transformations before even evaluating whether vectorization is profitable. 7376 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7377 // the vectorization pipeline. 7378 assert(!OrigLoop->empty()); 7379 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7380 7381 // Create new empty VPlan 7382 auto Plan = std::make_unique<VPlan>(); 7383 7384 // Build hierarchical CFG 7385 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 7386 HCFGBuilder.buildHierarchicalCFG(); 7387 7388 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) 7389 Plan->addVF(VF); 7390 7391 if (EnableVPlanPredication) { 7392 VPlanPredicator VPP(*Plan); 7393 VPP.predicate(); 7394 7395 // Avoid running transformation to recipes until masked code generation in 7396 // VPlan-native path is in place. 
7397 return Plan; 7398 } 7399 7400 SmallPtrSet<Instruction *, 1> DeadInstructions; 7401 VPlanTransforms::VPInstructionsToVPRecipes( 7402 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 7403 return Plan; 7404 } 7405 7406 Value* LoopVectorizationPlanner::VPCallbackILV:: 7407 getOrCreateVectorValues(Value *V, unsigned Part) { 7408 return ILV.getOrCreateVectorValue(V, Part); 7409 } 7410 7411 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 7412 Value *V, const VPIteration &Instance) { 7413 return ILV.getOrCreateScalarValue(V, Instance); 7414 } 7415 7416 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 7417 VPSlotTracker &SlotTracker) const { 7418 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7419 IG->getInsertPos()->printAsOperand(O, false); 7420 O << ", "; 7421 getAddr()->printAsOperand(O, SlotTracker); 7422 VPValue *Mask = getMask(); 7423 if (Mask) { 7424 O << ", "; 7425 Mask->printAsOperand(O, SlotTracker); 7426 } 7427 for (unsigned i = 0; i < IG->getFactor(); ++i) 7428 if (Instruction *I = IG->getMember(i)) 7429 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 7430 } 7431 7432 void VPWidenCallRecipe::execute(VPTransformState &State) { 7433 State.ILV->widenCallInstruction(Ingredient, User, State); 7434 } 7435 7436 void VPWidenSelectRecipe::execute(VPTransformState &State) { 7437 State.ILV->widenSelectInstruction(Ingredient, InvariantCond); 7438 } 7439 7440 void VPWidenRecipe::execute(VPTransformState &State) { 7441 State.ILV->widenInstruction(Ingredient, User, State); 7442 } 7443 7444 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7445 State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, 7446 IsIndexLoopInvariant); 7447 } 7448 7449 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7450 assert(!State.Instance && "Int or FP induction being replicated."); 7451 State.ILV->widenIntOrFpInduction(IV, Trunc); 7452 } 7453 7454 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7455 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7456 } 7457 7458 void VPBlendRecipe::execute(VPTransformState &State) { 7459 State.ILV->setDebugLocFromInst(State.Builder, Phi); 7460 // We know that all PHIs in non-header blocks are converted into 7461 // selects, so we don't have to worry about the insertion order and we 7462 // can just use the builder. 7463 // At this point we generate the predication tree. There may be 7464 // duplications since this is a simple recursive scan, but future 7465 // optimizations will clean it up. 7466 7467 unsigned NumIncoming = getNumIncomingValues(); 7468 7469 // Generate a sequence of selects of the form: 7470 // SELECT(Mask3, In3, 7471 // SELECT(Mask2, In2, 7472 // SELECT(Mask1, In1, 7473 // In0))) 7474 // Note that Mask0 is never used: lanes for which no path reaches this phi and 7475 // are essentially undef are taken from In0. 7476 InnerLoopVectorizer::VectorParts Entry(State.UF); 7477 for (unsigned In = 0; In < NumIncoming; ++In) { 7478 for (unsigned Part = 0; Part < State.UF; ++Part) { 7479 // We might have single edge PHIs (blocks) - use an identity 7480 // 'select' for the first PHI operand. 7481 Value *In0 = State.get(getIncomingValue(In), Part); 7482 if (In == 0) 7483 Entry[Part] = In0; // Initialize with the first incoming value. 7484 else { 7485 // Select between the current value and the previous incoming edge 7486 // based on the incoming mask. 
7487 Value *Cond = State.get(getMask(In), Part); 7488 Entry[Part] = 7489 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 7490 } 7491 } 7492 } 7493 for (unsigned Part = 0; Part < State.UF; ++Part) 7494 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 7495 } 7496 7497 void VPInterleaveRecipe::execute(VPTransformState &State) { 7498 assert(!State.Instance && "Interleave group being replicated."); 7499 State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask()); 7500 } 7501 7502 void VPReplicateRecipe::execute(VPTransformState &State) { 7503 if (State.Instance) { // Generate a single instance. 7504 State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance, 7505 IsPredicated, State); 7506 // Insert scalar instance packing it into a vector. 7507 if (AlsoPack && State.VF > 1) { 7508 // If we're constructing lane 0, initialize to start from undef. 7509 if (State.Instance->Lane == 0) { 7510 Value *Undef = 7511 UndefValue::get(VectorType::get(Ingredient->getType(), State.VF)); 7512 State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); 7513 } 7514 State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance); 7515 } 7516 return; 7517 } 7518 7519 // Generate scalar instances for all VF lanes of all UF parts, unless the 7520 // instruction is uniform, in which case generate only the first lane for each 7521 // of the UF parts. 7522 unsigned EndLane = IsUniform ? 1 : State.VF; 7523 for (unsigned Part = 0; Part < State.UF; ++Part) 7524 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 7525 State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane}, 7526 IsPredicated, State); 7527 } 7528 7529 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 7530 assert(State.Instance && "Branch on Mask works only on single instance."); 7531 7532 unsigned Part = State.Instance->Part; 7533 unsigned Lane = State.Instance->Lane; 7534 7535 Value *ConditionBit = nullptr; 7536 VPValue *BlockInMask = getMask(); 7537 if (BlockInMask) { 7538 ConditionBit = State.get(BlockInMask, Part); 7539 if (ConditionBit->getType()->isVectorTy()) 7540 ConditionBit = State.Builder.CreateExtractElement( 7541 ConditionBit, State.Builder.getInt32(Lane)); 7542 } else // Block in mask is all-one. 7543 ConditionBit = State.Builder.getTrue(); 7544 7545 // Replace the temporary unreachable terminator with a new conditional branch, 7546 // whose two destinations will be set later when they are created.
7547 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 7548 assert(isa<UnreachableInst>(CurrentTerminator) && 7549 "Expected to replace unreachable terminator with conditional branch."); 7550 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 7551 CondBr->setSuccessor(0, nullptr); 7552 ReplaceInstWithInst(CurrentTerminator, CondBr); 7553 } 7554 7555 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 7556 assert(State.Instance && "Predicated instruction PHI works per instance."); 7557 Instruction *ScalarPredInst = cast<Instruction>( 7558 State.ValueMap.getScalarValue(PredInst, *State.Instance)); 7559 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 7560 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 7561 assert(PredicatingBB && "Predicated block has no single predecessor."); 7562 7563 // By current pack/unpack logic we need to generate only a single phi node: if 7564 // a vector value for the predicated instruction exists at this point it means 7565 // the instruction has vector users only, and a phi for the vector value is 7566 // needed. In this case the recipe of the predicated instruction is marked to 7567 // also do that packing, thereby "hoisting" the insert-element sequence. 7568 // Otherwise, a phi node for the scalar value is needed. 7569 unsigned Part = State.Instance->Part; 7570 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 7571 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 7572 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 7573 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 7574 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 7575 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 7576 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 7577 } else { 7578 Type *PredInstType = PredInst->getType(); 7579 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 7580 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 7581 Phi->addIncoming(ScalarPredInst, PredicatedBB); 7582 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 7583 } 7584 } 7585 7586 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 7587 VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr; 7588 State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue, 7589 getMask()); 7590 } 7591 7592 // Determine how to lower the scalar epilogue, which depends on 1) optimising 7593 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 7594 // predication, and 4) a TTI hook that analyses whether the loop is suitable 7595 // for predication. 7596 static ScalarEpilogueLowering getScalarEpilogueLowering( 7597 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 7598 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 7599 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 7600 LoopVectorizationLegality &LVL) { 7601 bool OptSize = 7602 F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 7603 PGSOQueryType::IRPass); 7604 // 1) OptSize takes precedence over all other options, i.e. if this is set, 7605 // don't look at hints or options, and don't request a scalar epilogue. 
7606 if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) 7607 return CM_ScalarEpilogueNotAllowedOptSize; 7608 7609 bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && 7610 !PreferPredicateOverEpilog; 7611 7612 // 2) Next, if disabling predication is requested on the command line, honour 7613 // this and request a scalar epilogue. 7614 if (PredicateOptDisabled) 7615 return CM_ScalarEpilogueAllowed; 7616 7617 // 3) and 4) Finally, if predication is requested on the command line or with 7618 // a loop hint, or if the TTI hook indicates it is profitable, request 7619 // predication. 7620 if (PreferPredicateOverEpilog || 7621 Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || 7622 (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 7623 LVL.getLAI()) && 7624 Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) 7625 return CM_ScalarEpilogueNotNeededUsePredicate; 7626 7627 return CM_ScalarEpilogueAllowed; 7628 } 7629 7630 // Process the loop in the VPlan-native vectorization path. This path builds 7631 // VPlan upfront in the vectorization pipeline, which allows applying 7632 // VPlan-to-VPlan transformations from the very beginning without modifying the 7633 // input LLVM IR. 7634 static bool processLoopInVPlanNativePath( 7635 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 7636 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 7637 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 7638 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 7639 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 7640 7641 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 7642 Function *F = L->getHeader()->getParent(); 7643 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 7644 7645 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7646 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 7647 7648 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 7649 &Hints, IAI); 7650 // Use the planner for outer loop vectorization. 7651 // TODO: CM is not used at this point inside the planner. Turn CM into an 7652 // optional argument if we don't need it in the future. 7653 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 7654 7655 // Get user vectorization factor. 7656 const unsigned UserVF = Hints.getWidth(); 7657 7658 // Plan how to best vectorize, return the best VF and its cost. 7659 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 7660 7661 // If we are stress testing VPlan builds, do not attempt to generate vector 7662 // code. Masked vector code generation support will follow soon. 7663 // Also, do not attempt to vectorize if no vector code will be produced. 7664 if (VPlanBuildStressTest || EnableVPlanPredication || 7665 VectorizationFactor::Disabled() == VF) 7666 return false; 7667 7668 LVP.setBestPlan(VF.Width, 1); 7669 7670 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 7671 &CM); 7672 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 7673 << L->getHeader()->getParent()->getName() << "\"\n"); 7674 LVP.executePlan(LB, DT); 7675 7676 // Mark the loop as already vectorized to avoid vectorizing again.
7677 Hints.setAlreadyVectorized(); 7678 7679 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 7680 return true; 7681 } 7682 7683 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 7684 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 7685 !EnableLoopInterleaving), 7686 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 7687 !EnableLoopVectorization) {} 7688 7689 bool LoopVectorizePass::processLoop(Loop *L) { 7690 assert((EnableVPlanNativePath || L->empty()) && 7691 "VPlan-native path is not enabled. Only process inner loops."); 7692 7693 #ifndef NDEBUG 7694 const std::string DebugLocStr = getDebugLocString(L); 7695 #endif /* NDEBUG */ 7696 7697 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 7698 << L->getHeader()->getParent()->getName() << "\" from " 7699 << DebugLocStr << "\n"); 7700 7701 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 7702 7703 LLVM_DEBUG( 7704 dbgs() << "LV: Loop hints:" 7705 << " force=" 7706 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 7707 ? "disabled" 7708 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 7709 ? "enabled" 7710 : "?")) 7711 << " width=" << Hints.getWidth() 7712 << " unroll=" << Hints.getInterleave() << "\n"); 7713 7714 // Function containing loop 7715 Function *F = L->getHeader()->getParent(); 7716 7717 // Looking at the diagnostic output is the only way to determine if a loop 7718 // was vectorized (other than looking at the IR or machine code), so it 7719 // is important to generate an optimization remark for each loop. Most of 7720 // these messages are generated as OptimizationRemarkAnalysis. Remarks 7721 // generated as OptimizationRemark and OptimizationRemarkMissed are 7722 // less verbose reporting vectorized loops and unvectorized loops that may 7723 // benefit from vectorization, respectively. 7724 7725 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 7726 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 7727 return false; 7728 } 7729 7730 PredicatedScalarEvolution PSE(*SE, *L); 7731 7732 // Check if it is legal to vectorize the loop. 7733 LoopVectorizationRequirements Requirements(*ORE); 7734 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 7735 &Requirements, &Hints, DB, AC); 7736 if (!LVL.canVectorize(EnableVPlanNativePath)) { 7737 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 7738 Hints.emitRemarkWithHints(); 7739 return false; 7740 } 7741 7742 // Check the function attributes and profiles to find out if this function 7743 // should be optimized for size. 7744 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7745 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 7746 7747 // Entrance to the VPlan-native vectorization path. Outer loops are processed 7748 // here. They may require CFG and instruction level transformations before 7749 // even evaluating whether vectorization is profitable. Since we cannot modify 7750 // the incoming IR, we need to build VPlan upfront in the vectorization 7751 // pipeline. 7752 if (!L->empty()) 7753 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 7754 ORE, BFI, PSI, Hints); 7755 7756 assert(L->empty() && "Inner loop expected."); 7757 7758 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 7759 // count by optimizing for size, to minimize overheads. 
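// For example, a loop known to execute only 3 iterations is unlikely to
// amortize runtime checks plus a scalar remainder, so unless vectorization is
// explicitly forced it is only vectorized in a way that avoids those
// overheads (handled via the scalar-epilogue lowering selected below).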
7760 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 7761 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 7762 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 7763 << "This loop is worth vectorizing only if no scalar " 7764 << "iteration overheads are incurred."); 7765 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 7766 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 7767 else { 7768 LLVM_DEBUG(dbgs() << "\n"); 7769 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 7770 } 7771 } 7772 7773 // Check the function attributes to see if implicit floats are allowed. 7774 // FIXME: This check doesn't seem possibly correct -- what if the loop is 7775 // an integer loop and the vector instructions selected are purely integer 7776 // vector instructions? 7777 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 7778 reportVectorizationFailure( 7779 "Can't vectorize when the NoImplicitFloat attribute is used", 7780 "loop not vectorized due to NoImplicitFloat attribute", 7781 "NoImplicitFloat", ORE, L); 7782 Hints.emitRemarkWithHints(); 7783 return false; 7784 } 7785 7786 // Check if the target supports potentially unsafe FP vectorization. 7787 // FIXME: Add a check for the type of safety issue (denormal, signaling) 7788 // for the target we're vectorizing for, to make sure none of the 7789 // additional fp-math flags can help. 7790 if (Hints.isPotentiallyUnsafe() && 7791 TTI->isFPVectorizationPotentiallyUnsafe()) { 7792 reportVectorizationFailure( 7793 "Potentially unsafe FP op prevents vectorization", 7794 "loop not vectorized due to unsafe FP support.", 7795 "UnsafeFP", ORE, L); 7796 Hints.emitRemarkWithHints(); 7797 return false; 7798 } 7799 7800 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 7801 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 7802 7803 // If an override option has been passed in for interleaved accesses, use it. 7804 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 7805 UseInterleaved = EnableInterleavedMemAccesses; 7806 7807 // Analyze interleaved memory accesses. 7808 if (UseInterleaved) { 7809 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 7810 } 7811 7812 // Use the cost model. 7813 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 7814 F, &Hints, IAI); 7815 CM.collectValuesToIgnore(); 7816 7817 // Use the planner for vectorization. 7818 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); 7819 7820 // Get user vectorization factor and interleave count. 7821 unsigned UserVF = Hints.getWidth(); 7822 unsigned UserIC = Hints.getInterleave(); 7823 7824 // Plan how to best vectorize, return the best VF and its cost. 7825 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 7826 7827 VectorizationFactor VF = VectorizationFactor::Disabled(); 7828 unsigned IC = 1; 7829 7830 if (MaybeVF) { 7831 VF = *MaybeVF; 7832 // Select the interleave count. 7833 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 7834 } 7835 7836 // Identify the diagnostic messages that should be produced. 
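// Four combinations of (VectorizeLoop, InterleaveLoop) are handled below:
// neither (emit missed-optimization remarks and bail out), interleave only,
// vectorize only, and both.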
7837 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 7838 bool VectorizeLoop = true, InterleaveLoop = true; 7839 if (Requirements.doesNotMeet(F, L, Hints)) { 7840 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " 7841 "requirements.\n"); 7842 Hints.emitRemarkWithHints(); 7843 return false; 7844 } 7845 7846 if (VF.Width == 1) { 7847 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 7848 VecDiagMsg = std::make_pair( 7849 "VectorizationNotBeneficial", 7850 "the cost-model indicates that vectorization is not beneficial"); 7851 VectorizeLoop = false; 7852 } 7853 7854 if (!MaybeVF && UserIC > 1) { 7855 // Tell the user interleaving was avoided up-front, despite being explicitly 7856 // requested. 7857 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 7858 "interleaving should be avoided up front\n"); 7859 IntDiagMsg = std::make_pair( 7860 "InterleavingAvoided", 7861 "Ignoring UserIC, because interleaving was avoided up front"); 7862 InterleaveLoop = false; 7863 } else if (IC == 1 && UserIC <= 1) { 7864 // Tell the user interleaving is not beneficial. 7865 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 7866 IntDiagMsg = std::make_pair( 7867 "InterleavingNotBeneficial", 7868 "the cost-model indicates that interleaving is not beneficial"); 7869 InterleaveLoop = false; 7870 if (UserIC == 1) { 7871 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; 7872 IntDiagMsg.second += 7873 " and is explicitly disabled or interleave count is set to 1"; 7874 } 7875 } else if (IC > 1 && UserIC == 1) { 7876 // Tell the user interleaving is beneficial, but it is explicitly disabled. 7877 LLVM_DEBUG( 7878 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); 7879 IntDiagMsg = std::make_pair( 7880 "InterleavingBeneficialButDisabled", 7881 "the cost-model indicates that interleaving is beneficial " 7882 "but is explicitly disabled or interleave count is set to 1"); 7883 InterleaveLoop = false; 7884 } 7885 7886 // Override IC if user provided an interleave count. 7887 IC = UserIC > 0 ? UserIC : IC; 7888 7889 // Emit diagnostic messages, if any. 7890 const char *VAPassName = Hints.vectorizeAnalysisPassName(); 7891 if (!VectorizeLoop && !InterleaveLoop) { 7892 // Do not vectorize or interleave the loop.
7893 ORE->emit([&]() { 7894 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 7895 L->getStartLoc(), L->getHeader()) 7896 << VecDiagMsg.second; 7897 }); 7898 ORE->emit([&]() { 7899 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 7900 L->getStartLoc(), L->getHeader()) 7901 << IntDiagMsg.second; 7902 }); 7903 return false; 7904 } else if (!VectorizeLoop && InterleaveLoop) { 7905 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7906 ORE->emit([&]() { 7907 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 7908 L->getStartLoc(), L->getHeader()) 7909 << VecDiagMsg.second; 7910 }); 7911 } else if (VectorizeLoop && !InterleaveLoop) { 7912 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7913 << ") in " << DebugLocStr << '\n'); 7914 ORE->emit([&]() { 7915 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 7916 L->getStartLoc(), L->getHeader()) 7917 << IntDiagMsg.second; 7918 }); 7919 } else if (VectorizeLoop && InterleaveLoop) { 7920 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7921 << ") in " << DebugLocStr << '\n'); 7922 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7923 } 7924 7925 LVP.setBestPlan(VF.Width, IC); 7926 7927 using namespace ore; 7928 bool DisableRuntimeUnroll = false; 7929 MDNode *OrigLoopID = L->getLoopID(); 7930 7931 if (!VectorizeLoop) { 7932 assert(IC > 1 && "interleave count should not be 1 or 0"); 7933 // If we decided that it is not legal to vectorize the loop, then 7934 // interleave it. 7935 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 7936 &CM); 7937 LVP.executePlan(Unroller, DT); 7938 7939 ORE->emit([&]() { 7940 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 7941 L->getHeader()) 7942 << "interleaved loop (interleaved count: " 7943 << NV("InterleaveCount", IC) << ")"; 7944 }); 7945 } else { 7946 // If we decided that it is *legal* to vectorize the loop, then do it. 7947 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 7948 &LVL, &CM); 7949 LVP.executePlan(LB, DT); 7950 ++LoopsVectorized; 7951 7952 // Add metadata to disable runtime unrolling a scalar loop when there are 7953 // no runtime checks about strides and memory. A scalar loop that is 7954 // rarely used is not worth unrolling. 7955 if (!LB.areSafetyChecksAdded()) 7956 DisableRuntimeUnroll = true; 7957 7958 // Report the vectorization decision. 7959 ORE->emit([&]() { 7960 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 7961 L->getHeader()) 7962 << "vectorized loop (vectorization width: " 7963 << NV("VectorizationFactor", VF.Width) 7964 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 7965 }); 7966 } 7967 7968 Optional<MDNode *> RemainderLoopID = 7969 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7970 LLVMLoopVectorizeFollowupEpilogue}); 7971 if (RemainderLoopID.hasValue()) { 7972 L->setLoopID(RemainderLoopID.getValue()); 7973 } else { 7974 if (DisableRuntimeUnroll) 7975 AddRuntimeUnrollDisableMetaData(L); 7976 7977 // Mark the loop as already vectorized to avoid vectorizing again. 
7978 Hints.setAlreadyVectorized(); 7979 } 7980 7981 assert(!verifyFunction(*L->getHeader()->getParent())); 7982 return true; 7983 } 7984 7985 LoopVectorizeResult LoopVectorizePass::runImpl( 7986 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 7987 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 7988 DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, 7989 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 7990 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 7991 SE = &SE_; 7992 LI = &LI_; 7993 TTI = &TTI_; 7994 DT = &DT_; 7995 BFI = &BFI_; 7996 TLI = TLI_; 7997 AA = &AA_; 7998 AC = &AC_; 7999 GetLAA = &GetLAA_; 8000 DB = &DB_; 8001 ORE = &ORE_; 8002 PSI = PSI_; 8003 8004 // Don't attempt if 8005 // 1. the target claims to have no vector registers, and 8006 // 2. interleaving won't help ILP. 8007 // 8008 // The second condition is necessary because, even if the target has no 8009 // vector registers, loop vectorization may still enable scalar 8010 // interleaving. 8011 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 8012 TTI->getMaxInterleaveFactor(1) < 2) 8013 return LoopVectorizeResult(false, false); 8014 8015 bool Changed = false, CFGChanged = false; 8016 8017 // The vectorizer requires loops to be in simplified form. 8018 // Since simplification may add new inner loops, it has to run before the 8019 // legality and profitability checks. This means running the loop vectorizer 8020 // will simplify all loops, regardless of whether anything ends up being 8021 // vectorized. 8022 for (auto &L : *LI) 8023 Changed |= CFGChanged |= 8024 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 8025 8026 // Build up a worklist of inner loops to vectorize. This is necessary as 8027 // the act of vectorizing or partially unrolling a loop creates new loops 8028 // and can invalidate iterators across the loops. 8029 SmallVector<Loop *, 8> Worklist; 8030 8031 for (Loop *L : *LI) 8032 collectSupportedLoops(*L, LI, ORE, Worklist); 8033 8034 LoopsAnalyzed += Worklist.size(); 8035 8036 // Now walk the identified inner loops. 8037 while (!Worklist.empty()) { 8038 Loop *L = Worklist.pop_back_val(); 8039 8040 // For the inner loops we actually process, form LCSSA to simplify the 8041 // transform. 8042 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 8043 8044 Changed |= CFGChanged |= processLoop(L); 8045 } 8046 8047 // Process each loop nest in the function. 8048 return LoopVectorizeResult(Changed, CFGChanged); 8049 } 8050 8051 PreservedAnalyses LoopVectorizePass::run(Function &F, 8052 FunctionAnalysisManager &AM) { 8053 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 8054 auto &LI = AM.getResult<LoopAnalysis>(F); 8055 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 8056 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 8057 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 8058 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 8059 auto &AA = AM.getResult<AAManager>(F); 8060 auto &AC = AM.getResult<AssumptionAnalysis>(F); 8061 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 8062 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 8063 MemorySSA *MSSA = EnableMSSALoopDependency 8064 ?
&AM.getResult<MemorySSAAnalysis>(F).getMSSA() 8065 : nullptr; 8066 8067 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 8068 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 8069 [&](Loop &L) -> const LoopAccessInfo & { 8070 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; 8071 return LAM.getResult<LoopAccessAnalysis>(L, AR); 8072 }; 8073 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 8074 ProfileSummaryInfo *PSI = 8075 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 8076 LoopVectorizeResult Result = 8077 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 8078 if (!Result.MadeAnyChange) 8079 return PreservedAnalyses::all(); 8080 PreservedAnalyses PA; 8081 8082 // We currently do not preserve loopinfo/dominator analyses with outer loop 8083 // vectorization. Until this is addressed, mark these analyses as preserved 8084 // only for non-VPlan-native path. 8085 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 8086 if (!EnableVPlanNativePath) { 8087 PA.preserve<LoopAnalysis>(); 8088 PA.preserve<DominatorTreeAnalysis>(); 8089 } 8090 PA.preserve<BasicAA>(); 8091 PA.preserve<GlobalsAA>(); 8092 if (!Result.MadeCFGChange) 8093 PA.preserveSet<CFGAnalyses>(); 8094 return PA; 8095 } 8096
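// Usage sketch (illustrative, not part of the pass itself): with the new pass
// manager the vectorizer can be exercised on its own via
//   opt -passes=loop-vectorize -S input.ll -o output.ll
// or programmatically by adding LoopVectorizePass to a FunctionPassManager
// whose FunctionAnalysisManager has the required analyses registered.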