//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
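//
// As an illustrative sketch (not code generated by this file), a loop such as
//
//   for (int i = 0; i < n; ++i)
//     A[i] = B[i] + C[i];
//
// is conceptually rewritten at VF = 4 so that each 'wide' iteration loads,
// adds and stores four consecutive elements of B, C and A, with a scalar
// epilogue loop handling the remaining n % 4 iterations.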
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired and that predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
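/// For example (illustrative, assuming a typical data layout): i1 has an
/// allocated size of one byte, while a <4 x i1> vector has a store size of one
/// byte, so at VF = 4 the check 4 * 1 != 1 classifies i1 as irregular.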
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, bool InvariantCond);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
                bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive.
  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
                            bool IfPredicateInstr);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
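  /// As an illustrative sketch of the scalar-to-vector case (the IR below is
  /// hypothetical, not emitted verbatim): if a definition %a was scalarized at
  /// VF = 4 and a use needs a vector, the lanes are packed with
  ///   %v.0 = insertelement <4 x i32> undef, i32 %a.0, i32 0
  ///   %v.1 = insertelement <4 x i32> %v.0,  i32 %a.1, i32 1
  /// and so on for lanes 2 and 3 (see packScalarIntoVectorValue).
  ///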
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                VPTransformState &State, VPValue *Addr,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr, VPValue *StoredValue,
                                  VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing, we only handled real values
  /// that were defined inside the loop, and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...;
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                               Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AliasAnalysis *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---
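  // Illustrative sketch of the skeleton around the vector loop (the precise
  // CFG, including the direct middle-block-to-exit edge, is built by
  // createVectorizedLoopSkeleton()):
  //
  //   LoopBypassBlocks -> LoopVectorPreHeader -> LoopVectorBody
  //        |  (check fails)                           |
  //        v                                          v
  //   LoopScalarPreHeader <----------------- LoopMiddleBlock
  //        |
  //        v
  //   LoopScalarBody -> LoopExitBlock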

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar loop.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
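  /// For example (illustrative numbers only): with TripCount = 37, VF = 4 and
  /// UF = 2, VectorTripCount = 37 - 37 % 8 = 32.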
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                       Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    }
    else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
    const StringRef OREMsg, const StringRef ORETag,
    OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF();

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

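  // Illustrative examples of this classification (not an exhaustive rule): a
  // unit-stride load such as A[i] is typically CM_Widen, A[n - i] is
  // CM_Widen_Reverse, members of an interleaved group become CM_Interleave,
  // and an indexed access such as A[B[i]] is either CM_GatherScatter or, if
  // that is not legal for the target, CM_Scalarize.
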
  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    MaybeAlign Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for scalarized memory instruction.
1353 unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1354
1355 /// The cost computation for interleaving group of memory instructions.
1356 unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1357
1358 /// The cost computation for Gather/Scatter instruction.
1359 unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1360
1361 /// The cost computation for widening instruction \p I with consecutive
1362 /// memory access.
1363 unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1364
1365 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1366 /// Load: scalar load + broadcast.
1367 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1368 /// element)
1369 unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1370
1371 /// Estimate the overhead of scalarizing an instruction. This is a
1372 /// convenience wrapper for the type-based getScalarizationOverhead API.
1373 unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1374
1375 /// Returns whether the instruction is a load or store and will be emitted
1376 /// as a vector operation.
1377 bool isConsecutiveLoadOrStore(Instruction *I);
1378
1379 /// Returns true if an artificially high cost for emulated masked memrefs
1380 /// should be used.
1381 bool useEmulatedMaskMemRefHack(Instruction *I);
1382
1383 /// Map of scalar integer values to the smallest bitwidth they can be legally
1384 /// represented as. The vector equivalents of these values should be truncated
1385 /// to this type.
1386 MapVector<Instruction *, uint64_t> MinBWs;
1387
1388 /// A type representing the costs for instructions if they were to be
1389 /// scalarized rather than vectorized. The entries are Instruction-Cost
1390 /// pairs.
1391 using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1392
1393 /// A set containing all BasicBlocks that are known to be present after
1394 /// vectorization as a predicated block.
1395 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1396
1397 /// Records whether it is allowed to have the original scalar loop execute at
1398 /// least once. This may be needed as a fallback loop in case runtime
1399 /// aliasing/dependence checks fail, or to handle the tail/remainder
1400 /// iterations when the trip count is unknown or doesn't divide by the VF,
1401 /// or as a peel-loop to handle gaps in interleave-groups.
1402 /// Under optsize and when the trip count is very small we don't allow any
1403 /// iterations to execute in the scalar loop.
1404 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1405
1406 /// All blocks of the loop are to be masked to fold the tail of the scalar iterations.
1407 bool FoldTailByMasking = false;
1408
1409 /// A map holding scalar costs for different vectorization factors. The
1410 /// presence of a cost for an instruction in the mapping indicates that the
1411 /// instruction will be scalarized when vectorizing with the associated
1412 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1413 DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1414
1415 /// Holds the instructions known to be uniform after vectorization.
1416 /// The data is collected per VF.
1417 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1418
1419 /// Holds the instructions known to be scalar after vectorization.
1420 /// The data is collected per VF.
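  /// For example (illustrative), Scalars[4] holds the instructions that are
  /// expected to remain scalar when vectorizing with VF = 4, as populated by
  /// collectLoopScalars(4).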
1421 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars; 1422 1423 /// Holds the instructions (address computations) that are forced to be 1424 /// scalarized. 1425 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1426 1427 /// Returns the expected difference in cost from scalarizing the expression 1428 /// feeding a predicated instruction \p PredInst. The instructions to 1429 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1430 /// non-negative return value implies the expression will be scalarized. 1431 /// Currently, only single-use chains are considered for scalarization. 1432 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1433 unsigned VF); 1434 1435 /// Collect the instructions that are uniform after vectorization. An 1436 /// instruction is uniform if we represent it with a single scalar value in 1437 /// the vectorized loop corresponding to each vector iteration. Examples of 1438 /// uniform instructions include pointer operands of consecutive or 1439 /// interleaved memory accesses. Note that although uniformity implies an 1440 /// instruction will be scalar, the reverse is not true. In general, a 1441 /// scalarized instruction will be represented by VF scalar values in the 1442 /// vectorized loop, each corresponding to an iteration of the original 1443 /// scalar loop. 1444 void collectLoopUniforms(unsigned VF); 1445 1446 /// Collect the instructions that are scalar after vectorization. An 1447 /// instruction is scalar if it is known to be uniform or will be scalarized 1448 /// during vectorization. Non-uniform scalarized instructions will be 1449 /// represented by VF values in the vectorized loop, each corresponding to an 1450 /// iteration of the original scalar loop. 1451 void collectLoopScalars(unsigned VF); 1452 1453 /// Keeps cost model vectorization decision and cost for instructions. 1454 /// Right now it is used for memory instructions only. 1455 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, 1456 std::pair<InstWidening, unsigned>>; 1457 1458 DecisionList WideningDecisions; 1459 1460 /// Returns true if \p V is expected to be vectorized and it needs to be 1461 /// extracted. 1462 bool needsExtract(Value *V, unsigned VF) const { 1463 Instruction *I = dyn_cast<Instruction>(V); 1464 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) 1465 return false; 1466 1467 // Assume we can vectorize V (and hence we need extraction) if the 1468 // scalars are not computed yet. This can happen, because it is called 1469 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1470 // the scalars are collected. That should be a safe assumption in most 1471 // cases, because we check if the operands have vectorizable types 1472 // beforehand in LoopVectorizationLegality. 1473 return Scalars.find(VF) == Scalars.end() || 1474 !isScalarAfterVectorization(I, VF); 1475 }; 1476 1477 /// Returns a range containing only operands needing to be extracted. 1478 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1479 unsigned VF) { 1480 return SmallVector<Value *, 4>(make_filter_range( 1481 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1482 } 1483 1484 public: 1485 /// The loop that we evaluate. 1486 Loop *TheLoop; 1487 1488 /// Predicated scalar evolution analysis. 1489 PredicatedScalarEvolution &PSE; 1490 1491 /// Loop Info analysis. 1492 LoopInfo *LI; 1493 1494 /// Vectorization legality. 
1495 LoopVectorizationLegality *Legal; 1496 1497 /// Vector target information. 1498 const TargetTransformInfo &TTI; 1499 1500 /// Target Library Info. 1501 const TargetLibraryInfo *TLI; 1502 1503 /// Demanded bits analysis. 1504 DemandedBits *DB; 1505 1506 /// Assumption cache. 1507 AssumptionCache *AC; 1508 1509 /// Interface to emit optimization remarks. 1510 OptimizationRemarkEmitter *ORE; 1511 1512 const Function *TheFunction; 1513 1514 /// Loop Vectorize Hint. 1515 const LoopVectorizeHints *Hints; 1516 1517 /// The interleave access information contains groups of interleaved accesses 1518 /// with the same stride and close to each other. 1519 InterleavedAccessInfo &InterleaveInfo; 1520 1521 /// Values to ignore in the cost model. 1522 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1523 1524 /// Values to ignore in the cost model when VF > 1. 1525 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1526 }; 1527 1528 } // end namespace llvm 1529 1530 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1531 // vectorization. The loop needs to be annotated with #pragma omp simd 1532 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1533 // vector length information is not provided, vectorization is not considered 1534 // explicit. Interleave hints are not allowed either. These limitations will be 1535 // relaxed in the future. 1536 // Please, note that we are currently forced to abuse the pragma 'clang 1537 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1538 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1539 // provides *explicit vectorization hints* (LV can bypass legal checks and 1540 // assume that vectorization is legal). However, both hints are implemented 1541 // using the same metadata (llvm.loop.vectorize, processed by 1542 // LoopVectorizeHints). This will be fixed in the future when the native IR 1543 // representation for pragma 'omp simd' is introduced. 1544 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1545 OptimizationRemarkEmitter *ORE) { 1546 assert(!OuterLp->empty() && "This is not an outer loop"); 1547 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1548 1549 // Only outer loops with an explicit vectorization hint are supported. 1550 // Unannotated outer loops are ignored. 1551 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1552 return false; 1553 1554 Function *Fn = OuterLp->getHeader()->getParent(); 1555 if (!Hints.allowVectorization(Fn, OuterLp, 1556 true /*VectorizeOnlyWhenForced*/)) { 1557 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1558 return false; 1559 } 1560 1561 if (Hints.getInterleave() > 1) { 1562 // TODO: Interleave support is future work. 1563 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1564 "outer loops.\n"); 1565 Hints.emitRemarkWithHints(); 1566 return false; 1567 } 1568 1569 return true; 1570 } 1571 1572 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1573 OptimizationRemarkEmitter *ORE, 1574 SmallVectorImpl<Loop *> &V) { 1575 // Collect inner loops and outer loops without irreducible control flow. For 1576 // now, only collect outer loops that have explicit vectorization hints. If we 1577 // are stress testing the VPlan H-CFG construction, we collect the outermost 1578 // loop of every loop nest. 
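  // For example (illustrative source, not from a specific test), an outer loop
  // such as
  //   #pragma omp simd simdlen(4)
  //   for (i = 0; i < N; ++i)
  //     for (j = 0; j < M; ++j)
  //       A[i][j] += B[i][j];
  // is collected here when the VPlan-native path is enabled, because
  // isExplicitVecOuterLoop above accepts it; unannotated outer loops are only
  // collected under VPlanBuildStressTest.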
1579 if (L.empty() || VPlanBuildStressTest || 1580 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1581 LoopBlocksRPO RPOT(&L); 1582 RPOT.perform(LI); 1583 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1584 V.push_back(&L); 1585 // TODO: Collect inner loops inside marked outer loops in case 1586 // vectorization fails for the outer loop. Do not invoke 1587 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1588 // already known to be reducible. We can use an inherited attribute for 1589 // that. 1590 return; 1591 } 1592 } 1593 for (Loop *InnerL : L) 1594 collectSupportedLoops(*InnerL, LI, ORE, V); 1595 } 1596 1597 namespace { 1598 1599 /// The LoopVectorize Pass. 1600 struct LoopVectorize : public FunctionPass { 1601 /// Pass identification, replacement for typeid 1602 static char ID; 1603 1604 LoopVectorizePass Impl; 1605 1606 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1607 bool VectorizeOnlyWhenForced = false) 1608 : FunctionPass(ID), 1609 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1610 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1611 } 1612 1613 bool runOnFunction(Function &F) override { 1614 if (skipFunction(F)) 1615 return false; 1616 1617 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1618 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1619 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1620 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1621 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1622 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1623 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1624 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1625 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1626 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1627 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1628 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1629 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1630 1631 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1632 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1633 1634 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1635 GetLAA, *ORE, PSI); 1636 } 1637 1638 void getAnalysisUsage(AnalysisUsage &AU) const override { 1639 AU.addRequired<AssumptionCacheTracker>(); 1640 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1641 AU.addRequired<DominatorTreeWrapperPass>(); 1642 AU.addRequired<LoopInfoWrapperPass>(); 1643 AU.addRequired<ScalarEvolutionWrapperPass>(); 1644 AU.addRequired<TargetTransformInfoWrapperPass>(); 1645 AU.addRequired<AAResultsWrapperPass>(); 1646 AU.addRequired<LoopAccessLegacyAnalysis>(); 1647 AU.addRequired<DemandedBitsWrapperPass>(); 1648 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1649 AU.addRequired<InjectTLIMappingsLegacy>(); 1650 1651 // We currently do not preserve loopinfo/dominator analyses with outer loop 1652 // vectorization. Until this is addressed, mark these analyses as preserved 1653 // only for non-VPlan-native path. 1654 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
1655 if (!EnableVPlanNativePath) { 1656 AU.addPreserved<LoopInfoWrapperPass>(); 1657 AU.addPreserved<DominatorTreeWrapperPass>(); 1658 } 1659 1660 AU.addPreserved<BasicAAWrapperPass>(); 1661 AU.addPreserved<GlobalsAAWrapperPass>(); 1662 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1663 } 1664 }; 1665 1666 } // end anonymous namespace 1667 1668 //===----------------------------------------------------------------------===// 1669 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1670 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1671 //===----------------------------------------------------------------------===// 1672 1673 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1674 // We need to place the broadcast of invariant variables outside the loop, 1675 // but only if it's proven safe to do so. Else, broadcast will be inside 1676 // vector loop body. 1677 Instruction *Instr = dyn_cast<Instruction>(V); 1678 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1679 (!Instr || 1680 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1681 // Place the code for broadcasting invariant variables in the new preheader. 1682 IRBuilder<>::InsertPointGuard Guard(Builder); 1683 if (SafeToHoist) 1684 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1685 1686 // Broadcast the scalar into all locations in the vector. 1687 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1688 1689 return Shuf; 1690 } 1691 1692 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1693 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1694 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1695 "Expected either an induction phi-node or a truncate of it!"); 1696 Value *Start = II.getStartValue(); 1697 1698 // Construct the initial value of the vector IV in the vector loop preheader 1699 auto CurrIP = Builder.saveIP(); 1700 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1701 if (isa<TruncInst>(EntryVal)) { 1702 assert(Start->getType()->isIntegerTy() && 1703 "Truncation requires an integer type"); 1704 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1705 Step = Builder.CreateTrunc(Step, TruncType); 1706 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1707 } 1708 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1709 Value *SteppedStart = 1710 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1711 1712 // We create vector phi nodes for both integer and floating-point induction 1713 // variables. Here, we determine the kind of arithmetic we will perform. 1714 Instruction::BinaryOps AddOp; 1715 Instruction::BinaryOps MulOp; 1716 if (Step->getType()->isIntegerTy()) { 1717 AddOp = Instruction::Add; 1718 MulOp = Instruction::Mul; 1719 } else { 1720 AddOp = II.getInductionOpcode(); 1721 MulOp = Instruction::FMul; 1722 } 1723 1724 // Multiply the vectorization factor by the step using integer or 1725 // floating-point arithmetic as appropriate. 1726 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); 1727 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1728 1729 // Create a vector splat to use in the induction update. 1730 // 1731 // FIXME: If the step is non-constant, we create the vector splat with 1732 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1733 // handle a constant vector splat. 1734 Value *SplatVF = 1735 isa<Constant>(Mul) 1736 ? 
ConstantVector::getSplat({VF, false}, cast<Constant>(Mul)) 1737 : Builder.CreateVectorSplat(VF, Mul); 1738 Builder.restoreIP(CurrIP); 1739 1740 // We may need to add the step a number of times, depending on the unroll 1741 // factor. The last of those goes into the PHI. 1742 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1743 &*LoopVectorBody->getFirstInsertionPt()); 1744 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1745 Instruction *LastInduction = VecInd; 1746 for (unsigned Part = 0; Part < UF; ++Part) { 1747 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1748 1749 if (isa<TruncInst>(EntryVal)) 1750 addMetadata(LastInduction, EntryVal); 1751 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1752 1753 LastInduction = cast<Instruction>(addFastMathFlag( 1754 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1755 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1756 } 1757 1758 // Move the last step to the end of the latch block. This ensures consistent 1759 // placement of all induction updates. 1760 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1761 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1762 auto *ICmp = cast<Instruction>(Br->getCondition()); 1763 LastInduction->moveBefore(ICmp); 1764 LastInduction->setName("vec.ind.next"); 1765 1766 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1767 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1768 } 1769 1770 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1771 return Cost->isScalarAfterVectorization(I, VF) || 1772 Cost->isProfitableToScalarize(I, VF); 1773 } 1774 1775 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1776 if (shouldScalarizeInstruction(IV)) 1777 return true; 1778 auto isScalarInst = [&](User *U) -> bool { 1779 auto *I = cast<Instruction>(U); 1780 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1781 }; 1782 return llvm::any_of(IV->users(), isScalarInst); 1783 } 1784 1785 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1786 const InductionDescriptor &ID, const Instruction *EntryVal, 1787 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1788 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1789 "Expected either an induction phi-node or a truncate of it!"); 1790 1791 // This induction variable is not the phi from the original loop but the 1792 // newly-created IV based on the proof that casted Phi is equal to the 1793 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1794 // re-uses the same InductionDescriptor that original IV uses but we don't 1795 // have to do any recording in this case - that is done when original IV is 1796 // processed. 1797 if (isa<TruncInst>(EntryVal)) 1798 return; 1799 1800 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1801 if (Casts.empty()) 1802 return; 1803 // Only the first Cast instruction in the Casts vector is of interest. 1804 // The rest of the Casts (if exist) have no uses outside the 1805 // induction update chain itself. 
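  // Illustrative sketch (one possible shape, for exposition only): an IV whose
  // update chain goes through redundant casts, e.g.
  //   %iv      = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
  //   %t       = trunc i64 %iv to i32
  //   %e       = sext i32 %t to i64
  //   %iv.next = add i64 %e, 1
  // where PSE proved (possibly under a runtime guard) that the casts behave as
  // the IV itself. The value created for the IV is recorded below for the
  // first such cast, since only that one may have uses outside the chain.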
1806 Instruction *CastInst = *Casts.begin(); 1807 if (Lane < UINT_MAX) 1808 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1809 else 1810 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1811 } 1812 1813 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1814 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1815 "Primary induction variable must have an integer type"); 1816 1817 auto II = Legal->getInductionVars().find(IV); 1818 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 1819 1820 auto ID = II->second; 1821 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1822 1823 // The value from the original loop to which we are mapping the new induction 1824 // variable. 1825 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1826 1827 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1828 1829 // Generate code for the induction step. Note that induction steps are 1830 // required to be loop-invariant 1831 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 1832 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 1833 "Induction step should be loop invariant"); 1834 if (PSE.getSE()->isSCEVable(IV->getType())) { 1835 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1836 return Exp.expandCodeFor(Step, Step->getType(), 1837 LoopVectorPreHeader->getTerminator()); 1838 } 1839 return cast<SCEVUnknown>(Step)->getValue(); 1840 }; 1841 1842 // The scalar value to broadcast. This is derived from the canonical 1843 // induction variable. If a truncation type is given, truncate the canonical 1844 // induction variable and step. Otherwise, derive these values from the 1845 // induction descriptor. 1846 auto CreateScalarIV = [&](Value *&Step) -> Value * { 1847 Value *ScalarIV = Induction; 1848 if (IV != OldInduction) { 1849 ScalarIV = IV->getType()->isIntegerTy() 1850 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1851 : Builder.CreateCast(Instruction::SIToFP, Induction, 1852 IV->getType()); 1853 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1854 ScalarIV->setName("offset.idx"); 1855 } 1856 if (Trunc) { 1857 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1858 assert(Step->getType()->isIntegerTy() && 1859 "Truncation requires an integer step"); 1860 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1861 Step = Builder.CreateTrunc(Step, TruncType); 1862 } 1863 return ScalarIV; 1864 }; 1865 1866 // Create the vector values from the scalar IV, in the absence of creating a 1867 // vector IV. 1868 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 1869 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1870 for (unsigned Part = 0; Part < UF; ++Part) { 1871 Value *EntryPart = 1872 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); 1873 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1874 if (Trunc) 1875 addMetadata(EntryPart, Trunc); 1876 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1877 } 1878 }; 1879 1880 // Now do the actual transformations, and start with creating the step value. 1881 Value *Step = CreateStepValue(ID.getStep()); 1882 if (VF <= 1) { 1883 Value *ScalarIV = CreateScalarIV(Step); 1884 CreateSplatIV(ScalarIV, Step); 1885 return; 1886 } 1887 1888 // Determine if we want a scalar version of the induction variable. 
This is 1889 // true if the induction variable itself is not widened, or if it has at 1890 // least one user in the loop that is not widened. 1891 auto NeedsScalarIV = needsScalarInduction(EntryVal); 1892 if (!NeedsScalarIV) { 1893 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1894 return; 1895 } 1896 1897 // Try to create a new independent vector induction variable. If we can't 1898 // create the phi node, we will splat the scalar induction variable in each 1899 // loop iteration. 1900 if (!shouldScalarizeInstruction(EntryVal)) { 1901 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1902 Value *ScalarIV = CreateScalarIV(Step); 1903 // Create scalar steps that can be used by instructions we will later 1904 // scalarize. Note that the addition of the scalar steps will not increase 1905 // the number of instructions in the loop in the common case prior to 1906 // InstCombine. We will be trading one vector extract for each scalar step. 1907 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1908 return; 1909 } 1910 1911 // If we haven't yet vectorized the induction variable, splat the scalar 1912 // induction variable, and build the necessary step vectors. 1913 // TODO: Don't do it unless the vectorized IV is really required. 1914 Value *ScalarIV = CreateScalarIV(Step); 1915 CreateSplatIV(ScalarIV, Step); 1916 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1917 } 1918 1919 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 1920 Instruction::BinaryOps BinOp) { 1921 // Create and check the types. 1922 auto *ValVTy = cast<VectorType>(Val->getType()); 1923 int VLen = ValVTy->getNumElements(); 1924 1925 Type *STy = Val->getType()->getScalarType(); 1926 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1927 "Induction Step must be an integer or FP"); 1928 assert(Step->getType() == STy && "Step has wrong type"); 1929 1930 SmallVector<Constant *, 8> Indices; 1931 1932 if (STy->isIntegerTy()) { 1933 // Create a vector of consecutive numbers from zero to VF. 1934 for (int i = 0; i < VLen; ++i) 1935 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 1936 1937 // Add the consecutive indices to the vector value. 1938 Constant *Cv = ConstantVector::get(Indices); 1939 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 1940 Step = Builder.CreateVectorSplat(VLen, Step); 1941 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1942 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 1943 // which can be found from the original scalar operations. 1944 Step = Builder.CreateMul(Cv, Step); 1945 return Builder.CreateAdd(Val, Step, "induction"); 1946 } 1947 1948 // Floating point induction. 1949 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1950 "Binary Opcode should be specified for FP induction"); 1951 // Create a vector of consecutive numbers from zero to VF. 1952 for (int i = 0; i < VLen; ++i) 1953 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 1954 1955 // Add the consecutive indices to the vector value. 1956 Constant *Cv = ConstantVector::get(Indices); 1957 1958 Step = Builder.CreateVectorSplat(VLen, Step); 1959 1960 // Floating point operations had to be 'fast' to enable the induction. 
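  // Illustrative result (a sketch, assuming VF = 4 and StartIdx = 0): the code
  // below computes
  //   Val BinOp (<0.0, 1.0, 2.0, 3.0> fmul splat(Step))
  // i.e. <Val+0*Step, Val+1*Step, Val+2*Step, Val+3*Step> for FAdd, with both
  // the fmul and the final BinOp marked 'fast'.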
1961 FastMathFlags Flags; 1962 Flags.setFast(); 1963 1964 Value *MulOp = Builder.CreateFMul(Cv, Step); 1965 if (isa<Instruction>(MulOp)) 1966 // Have to check, MulOp may be a constant 1967 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 1968 1969 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1970 if (isa<Instruction>(BOp)) 1971 cast<Instruction>(BOp)->setFastMathFlags(Flags); 1972 return BOp; 1973 } 1974 1975 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 1976 Instruction *EntryVal, 1977 const InductionDescriptor &ID) { 1978 // We shouldn't have to build scalar steps if we aren't vectorizing. 1979 assert(VF > 1 && "VF should be greater than one"); 1980 1981 // Get the value type and ensure it and the step have the same integer type. 1982 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 1983 assert(ScalarIVTy == Step->getType() && 1984 "Val and Step should have the same type"); 1985 1986 // We build scalar steps for both integer and floating-point induction 1987 // variables. Here, we determine the kind of arithmetic we will perform. 1988 Instruction::BinaryOps AddOp; 1989 Instruction::BinaryOps MulOp; 1990 if (ScalarIVTy->isIntegerTy()) { 1991 AddOp = Instruction::Add; 1992 MulOp = Instruction::Mul; 1993 } else { 1994 AddOp = ID.getInductionOpcode(); 1995 MulOp = Instruction::FMul; 1996 } 1997 1998 // Determine the number of scalars we need to generate for each unroll 1999 // iteration. If EntryVal is uniform, we only need to generate the first 2000 // lane. Otherwise, we generate all VF values. 2001 unsigned Lanes = 2002 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 2003 : VF; 2004 // Compute the scalar steps and save the results in VectorLoopValueMap. 2005 for (unsigned Part = 0; Part < UF; ++Part) { 2006 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2007 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); 2008 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2009 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2010 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2011 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2012 } 2013 } 2014 } 2015 2016 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2017 assert(V != Induction && "The new induction variable should not be used."); 2018 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2019 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2020 2021 // If we have a stride that is replaced by one, do it here. Defer this for 2022 // the VPlan-native path until we start running Legal checks in that path. 2023 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2024 V = ConstantInt::get(V->getType(), 1); 2025 2026 // If we have a vector mapped to this value, return it. 2027 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2028 return VectorLoopValueMap.getVectorValue(V, Part); 2029 2030 // If the value has not been vectorized, check if it has been scalarized 2031 // instead. If it has been scalarized, and we actually need the value in 2032 // vector form, we will construct the vector values on demand. 2033 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2034 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2035 2036 // If we've scalarized a value, that value should be an instruction. 
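    // Illustrative sketch (assuming VF = 4 and an i32 value) of the
    // non-uniform path below: the per-lane scalars are packed back into a
    // vector right after the last scalar definition, e.g.
    //   %v0 = insertelement <4 x i32> undef, i32 %s0, i32 0
    //   %v1 = insertelement <4 x i32> %v0,   i32 %s1, i32 1
    //   ... and so on for the remaining lanes ...
    // so that later users can consume the vector form.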
2037 auto *I = cast<Instruction>(V); 2038 2039 // If we aren't vectorizing, we can just copy the scalar map values over to 2040 // the vector map. 2041 if (VF == 1) { 2042 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2043 return ScalarValue; 2044 } 2045 2046 // Get the last scalar instruction we generated for V and Part. If the value 2047 // is known to be uniform after vectorization, this corresponds to lane zero 2048 // of the Part unroll iteration. Otherwise, the last instruction is the one 2049 // we created for the last vector lane of the Part unroll iteration. 2050 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; 2051 auto *LastInst = cast<Instruction>( 2052 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2053 2054 // Set the insert point after the last scalarized instruction. This ensures 2055 // the insertelement sequence will directly follow the scalar definitions. 2056 auto OldIP = Builder.saveIP(); 2057 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2058 Builder.SetInsertPoint(&*NewIP); 2059 2060 // However, if we are vectorizing, we need to construct the vector values. 2061 // If the value is known to be uniform after vectorization, we can just 2062 // broadcast the scalar value corresponding to lane zero for each unroll 2063 // iteration. Otherwise, we construct the vector values using insertelement 2064 // instructions. Since the resulting vectors are stored in 2065 // VectorLoopValueMap, we will only generate the insertelements once. 2066 Value *VectorValue = nullptr; 2067 if (Cost->isUniformAfterVectorization(I, VF)) { 2068 VectorValue = getBroadcastInstrs(ScalarValue); 2069 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2070 } else { 2071 // Initialize packing with insertelements to start from undef. 2072 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2073 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2074 for (unsigned Lane = 0; Lane < VF; ++Lane) 2075 packScalarIntoVectorValue(V, {Part, Lane}); 2076 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2077 } 2078 Builder.restoreIP(OldIP); 2079 return VectorValue; 2080 } 2081 2082 // If this scalar is unknown, assume that it is a constant or that it is 2083 // loop invariant. Broadcast V and save the value for future uses. 2084 Value *B = getBroadcastInstrs(V); 2085 VectorLoopValueMap.setVectorValue(V, Part, B); 2086 return B; 2087 } 2088 2089 Value * 2090 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2091 const VPIteration &Instance) { 2092 // If the value is not an instruction contained in the loop, it should 2093 // already be scalar. 2094 if (OrigLoop->isLoopInvariant(V)) 2095 return V; 2096 2097 assert(Instance.Lane > 0 2098 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2099 : true && "Uniform values only have lane zero"); 2100 2101 // If the value from the original loop has not been vectorized, it is 2102 // represented by UF x VF scalar values in the new loop. Return the requested 2103 // scalar value. 2104 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2105 return VectorLoopValueMap.getScalarValue(V, Instance); 2106 2107 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2108 // for the given unroll part. If this entry is not a vector type (i.e., the 2109 // vectorization factor is one), there is no need to generate an 2110 // extractelement instruction. 
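  // Illustrative sketch (assuming VF = 4 and an i32 value): requesting
  // {Part 0, Lane 2} of a value that was vectorized yields
  //   %lane = extractelement <4 x i32> %vec.part0, i32 2
  // whereas with VF = 1 the mapped value is already scalar and is returned
  // directly.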
2111 auto *U = getOrCreateVectorValue(V, Instance.Part); 2112 if (!U->getType()->isVectorTy()) { 2113 assert(VF == 1 && "Value not scalarized has non-vector type"); 2114 return U; 2115 } 2116 2117 // Otherwise, the value from the original loop has been vectorized and is 2118 // represented by UF vector values. Extract and return the requested scalar 2119 // value from the appropriate vector lane. 2120 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2121 } 2122 2123 void InnerLoopVectorizer::packScalarIntoVectorValue( 2124 Value *V, const VPIteration &Instance) { 2125 assert(V != Induction && "The new induction variable should not be used."); 2126 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2127 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2128 2129 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2130 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2131 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2132 Builder.getInt32(Instance.Lane)); 2133 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2134 } 2135 2136 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2137 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2138 SmallVector<int, 8> ShuffleMask; 2139 for (unsigned i = 0; i < VF; ++i) 2140 ShuffleMask.push_back(VF - i - 1); 2141 2142 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2143 ShuffleMask, "reverse"); 2144 } 2145 2146 // Return whether we allow using masked interleave-groups (for dealing with 2147 // strided loads/stores that reside in predicated blocks, or for dealing 2148 // with gaps). 2149 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2150 // If an override option has been passed in for interleaved accesses, use it. 2151 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2152 return EnableMaskedInterleavedMemAccesses; 2153 2154 return TTI.enableMaskedInterleavedAccessVectorization(); 2155 } 2156 2157 // Try to vectorize the interleave group that \p Instr belongs to. 2158 // 2159 // E.g. Translate following interleaved load group (factor = 3): 2160 // for (i = 0; i < N; i+=3) { 2161 // R = Pic[i]; // Member of index 0 2162 // G = Pic[i+1]; // Member of index 1 2163 // B = Pic[i+2]; // Member of index 2 2164 // ... // do something to R, G, B 2165 // } 2166 // To: 2167 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2168 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2169 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2170 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2171 // 2172 // Or translate following interleaved store group (factor = 3): 2173 // for (i = 0; i < N; i+=3) { 2174 // ... 
do something to R, G, B 2175 // Pic[i] = R; // Member of index 0 2176 // Pic[i+1] = G; // Member of index 1 2177 // Pic[i+2] = B; // Member of index 2 2178 // } 2179 // To: 2180 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2181 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2182 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2183 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2184 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2185 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2186 const InterleaveGroup<Instruction> *Group, VPTransformState &State, 2187 VPValue *Addr, VPValue *BlockInMask) { 2188 Instruction *Instr = Group->getInsertPos(); 2189 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2190 2191 // Prepare for the vector type of the interleaved load/store. 2192 Type *ScalarTy = getMemInstValueType(Instr); 2193 unsigned InterleaveFactor = Group->getFactor(); 2194 Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); 2195 2196 // Prepare for the new pointers. 2197 SmallVector<Value *, 2> AddrParts; 2198 unsigned Index = Group->getIndex(Instr); 2199 2200 // TODO: extend the masked interleaved-group support to reversed access. 2201 assert((!BlockInMask || !Group->isReverse()) && 2202 "Reversed masked interleave-group not supported."); 2203 2204 // If the group is reverse, adjust the index to refer to the last vector lane 2205 // instead of the first. We adjust the index from the first vector lane, 2206 // rather than directly getting the pointer for lane VF - 1, because the 2207 // pointer operand of the interleaved access is supposed to be uniform. For 2208 // uniform instructions, we're only required to generate a value for the 2209 // first vector lane in each unroll iteration. 2210 if (Group->isReverse()) 2211 Index += (VF - 1) * Group->getFactor(); 2212 2213 for (unsigned Part = 0; Part < UF; Part++) { 2214 Value *AddrPart = State.get(Addr, {Part, 0}); 2215 setDebugLocFromInst(Builder, AddrPart); 2216 2217 // Notice current instruction could be any index. Need to adjust the address 2218 // to the member of index 0. 2219 // 2220 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2221 // b = A[i]; // Member of index 0 2222 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2223 // 2224 // E.g. A[i+1] = a; // Member of index 1 2225 // A[i] = b; // Member of index 0 2226 // A[i+2] = c; // Member of index 2 (Current instruction) 2227 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2228 2229 bool InBounds = false; 2230 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2231 InBounds = gep->isInBounds(); 2232 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2233 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2234 2235 // Cast to the vector pointer type. 2236 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2237 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2238 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2239 } 2240 2241 setDebugLocFromInst(Builder, Instr); 2242 Value *UndefVec = UndefValue::get(VecTy); 2243 2244 Value *MaskForGaps = nullptr; 2245 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2246 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); 2247 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2248 } 2249 2250 // Vectorize the interleaved load group. 
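  // Illustrative sketch (assuming VF = 4 and a factor-3 group as in the R/G/B
  // example above): when the block is predicated, the per-iteration mask
  // <m0, m1, m2, m3> is replicated per member to
  //   <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3>,
  // optionally AND'ed with the gap mask, and fed to a single masked load of
  // <12 x i32>; each member is then extracted with the strided shuffles shown
  // in the comment above this function.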
2251 if (isa<LoadInst>(Instr)) { 2252 // For each unroll part, create a wide load for the group. 2253 SmallVector<Value *, 2> NewLoads; 2254 for (unsigned Part = 0; Part < UF; Part++) { 2255 Instruction *NewLoad; 2256 if (BlockInMask || MaskForGaps) { 2257 assert(useMaskedInterleavedAccesses(*TTI) && 2258 "masked interleaved groups are not allowed."); 2259 Value *GroupMask = MaskForGaps; 2260 if (BlockInMask) { 2261 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2262 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2263 Value *ShuffledMask = Builder.CreateShuffleVector( 2264 BlockInMaskPart, Undefs, 2265 createReplicatedMask(InterleaveFactor, VF), "interleaved.mask"); 2266 GroupMask = MaskForGaps 2267 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2268 MaskForGaps) 2269 : ShuffledMask; 2270 } 2271 NewLoad = 2272 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2273 GroupMask, UndefVec, "wide.masked.vec"); 2274 } 2275 else 2276 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2277 Group->getAlign(), "wide.vec"); 2278 Group->addMetadata(NewLoad); 2279 NewLoads.push_back(NewLoad); 2280 } 2281 2282 // For each member in the group, shuffle out the appropriate data from the 2283 // wide loads. 2284 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2285 Instruction *Member = Group->getMember(I); 2286 2287 // Skip the gaps in the group. 2288 if (!Member) 2289 continue; 2290 2291 auto StrideMask = createStrideMask(I, InterleaveFactor, VF); 2292 for (unsigned Part = 0; Part < UF; Part++) { 2293 Value *StridedVec = Builder.CreateShuffleVector( 2294 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2295 2296 // If this member has different type, cast the result type. 2297 if (Member->getType() != ScalarTy) { 2298 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2299 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2300 } 2301 2302 if (Group->isReverse()) 2303 StridedVec = reverseVector(StridedVec); 2304 2305 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2306 } 2307 } 2308 return; 2309 } 2310 2311 // The sub vector type for current instruction. 2312 VectorType *SubVT = VectorType::get(ScalarTy, VF); 2313 2314 // Vectorize the interleaved store group. 2315 for (unsigned Part = 0; Part < UF; Part++) { 2316 // Collect the stored vector from each member. 2317 SmallVector<Value *, 4> StoredVecs; 2318 for (unsigned i = 0; i < InterleaveFactor; i++) { 2319 // Interleaved store group doesn't allow a gap, so each index has a member 2320 Instruction *Member = Group->getMember(i); 2321 assert(Member && "Fail to get a member from an interleaved store group"); 2322 2323 Value *StoredVec = getOrCreateVectorValue( 2324 cast<StoreInst>(Member)->getValueOperand(), Part); 2325 if (Group->isReverse()) 2326 StoredVec = reverseVector(StoredVec); 2327 2328 // If this member has different type, cast it to a unified type. 2329 2330 if (StoredVec->getType() != SubVT) 2331 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2332 2333 StoredVecs.push_back(StoredVec); 2334 } 2335 2336 // Concatenate all vectors into a wide vector. 2337 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2338 2339 // Interleave the elements in the wide vector. 
2340 Value *IVec = Builder.CreateShuffleVector( 2341 WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor), 2342 "interleaved.vec"); 2343 2344 Instruction *NewStoreInstr; 2345 if (BlockInMask) { 2346 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2347 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2348 Value *ShuffledMask = Builder.CreateShuffleVector( 2349 BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF), 2350 "interleaved.mask"); 2351 NewStoreInstr = Builder.CreateMaskedStore( 2352 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2353 } 2354 else 2355 NewStoreInstr = 2356 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2357 2358 Group->addMetadata(NewStoreInstr); 2359 } 2360 } 2361 2362 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2363 VPTransformState &State, 2364 VPValue *Addr, 2365 VPValue *StoredValue, 2366 VPValue *BlockInMask) { 2367 // Attempt to issue a wide load. 2368 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2369 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2370 2371 assert((LI || SI) && "Invalid Load/Store instruction"); 2372 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2373 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2374 2375 LoopVectorizationCostModel::InstWidening Decision = 2376 Cost->getWideningDecision(Instr, VF); 2377 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2378 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2379 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2380 "CM decision is not to widen the memory instruction"); 2381 2382 Type *ScalarDataTy = getMemInstValueType(Instr); 2383 Type *DataTy = VectorType::get(ScalarDataTy, VF); 2384 // An alignment of 0 means target abi alignment. We need to use the scalar's 2385 // target abi alignment in such a case. 2386 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2387 const Align Alignment = 2388 DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy); 2389 2390 // Determine if the pointer operand of the access is either consecutive or 2391 // reverse consecutive. 2392 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2393 bool ConsecutiveStride = 2394 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2395 bool CreateGatherScatter = 2396 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2397 2398 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2399 // gather/scatter. Otherwise Decision should have been to Scalarize. 2400 assert((ConsecutiveStride || CreateGatherScatter) && 2401 "The instruction should be scalarized"); 2402 (void)ConsecutiveStride; 2403 2404 VectorParts BlockInMaskParts(UF); 2405 bool isMaskRequired = BlockInMask; 2406 if (isMaskRequired) 2407 for (unsigned Part = 0; Part < UF; ++Part) 2408 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2409 2410 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2411 // Calculate the pointer for the specific unroll-part. 2412 GetElementPtrInst *PartPtr = nullptr; 2413 2414 bool InBounds = false; 2415 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2416 InBounds = gep->isInBounds(); 2417 2418 if (Reverse) { 2419 // If the address is consecutive but reversed, then the 2420 // wide store needs to start at the last vector element. 
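      // Illustrative sketch (assuming VF = 4): for Part 0 the two GEPs below
      // compute Ptr + (-0 * 4) + (1 - 4) = Ptr - 3, so the wide access covers
      // [Ptr-3 .. Ptr]; for Part 1 they compute Ptr - 4 - 3 = Ptr - 7,
      // covering [Ptr-7 .. Ptr-4], and so on, walking memory backwards.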
2421 PartPtr = cast<GetElementPtrInst>( 2422 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); 2423 PartPtr->setIsInBounds(InBounds); 2424 PartPtr = cast<GetElementPtrInst>( 2425 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); 2426 PartPtr->setIsInBounds(InBounds); 2427 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2428 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2429 } else { 2430 PartPtr = cast<GetElementPtrInst>( 2431 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); 2432 PartPtr->setIsInBounds(InBounds); 2433 } 2434 2435 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2436 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2437 }; 2438 2439 // Handle Stores: 2440 if (SI) { 2441 setDebugLocFromInst(Builder, SI); 2442 2443 for (unsigned Part = 0; Part < UF; ++Part) { 2444 Instruction *NewSI = nullptr; 2445 Value *StoredVal = State.get(StoredValue, Part); 2446 if (CreateGatherScatter) { 2447 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2448 Value *VectorGep = State.get(Addr, Part); 2449 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2450 MaskPart); 2451 } else { 2452 if (Reverse) { 2453 // If we store to reverse consecutive memory locations, then we need 2454 // to reverse the order of elements in the stored value. 2455 StoredVal = reverseVector(StoredVal); 2456 // We don't want to update the value in the map as it might be used in 2457 // another expression. So don't call resetVectorValue(StoredVal). 2458 } 2459 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2460 if (isMaskRequired) 2461 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2462 BlockInMaskParts[Part]); 2463 else 2464 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2465 } 2466 addMetadata(NewSI, SI); 2467 } 2468 return; 2469 } 2470 2471 // Handle loads. 2472 assert(LI && "Must have a load instruction"); 2473 setDebugLocFromInst(Builder, LI); 2474 for (unsigned Part = 0; Part < UF; ++Part) { 2475 Value *NewLI; 2476 if (CreateGatherScatter) { 2477 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2478 Value *VectorGep = State.get(Addr, Part); 2479 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2480 nullptr, "wide.masked.gather"); 2481 addMetadata(NewLI, LI); 2482 } else { 2483 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2484 if (isMaskRequired) 2485 NewLI = Builder.CreateMaskedLoad( 2486 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2487 "wide.masked.load"); 2488 else 2489 NewLI = 2490 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2491 2492 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2493 addMetadata(NewLI, LI); 2494 if (Reverse) 2495 NewLI = reverseVector(NewLI); 2496 } 2497 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2498 } 2499 } 2500 2501 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2502 const VPIteration &Instance, 2503 bool IfPredicateInstr) { 2504 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2505 2506 setDebugLocFromInst(Builder, Instr); 2507 2508 // Does this instruction return a value ? 
2509 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2510 2511 Instruction *Cloned = Instr->clone(); 2512 if (!IsVoidRetTy) 2513 Cloned->setName(Instr->getName() + ".cloned"); 2514 2515 // Replace the operands of the cloned instructions with their scalar 2516 // equivalents in the new loop. 2517 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 2518 auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance); 2519 Cloned->setOperand(op, NewOp); 2520 } 2521 addNewMetadata(Cloned, Instr); 2522 2523 // Place the cloned scalar in the new loop. 2524 Builder.Insert(Cloned); 2525 2526 // Add the cloned scalar to the scalar map entry. 2527 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2528 2529 // If we just cloned a new assumption, add it the assumption cache. 2530 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2531 if (II->getIntrinsicID() == Intrinsic::assume) 2532 AC->registerAssumption(II); 2533 2534 // End if-block. 2535 if (IfPredicateInstr) 2536 PredicatedInstructions.push_back(Cloned); 2537 } 2538 2539 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2540 Value *End, Value *Step, 2541 Instruction *DL) { 2542 BasicBlock *Header = L->getHeader(); 2543 BasicBlock *Latch = L->getLoopLatch(); 2544 // As we're just creating this loop, it's possible no latch exists 2545 // yet. If so, use the header as this will be a single block loop. 2546 if (!Latch) 2547 Latch = Header; 2548 2549 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2550 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2551 setDebugLocFromInst(Builder, OldInst); 2552 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2553 2554 Builder.SetInsertPoint(Latch->getTerminator()); 2555 setDebugLocFromInst(Builder, OldInst); 2556 2557 // Create i+1 and fill the PHINode. 2558 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2559 Induction->addIncoming(Start, L->getLoopPreheader()); 2560 Induction->addIncoming(Next, Latch); 2561 // Create the compare. 2562 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2563 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2564 2565 // Now we have two terminators. Remove the old one from the block. 2566 Latch->getTerminator()->eraseFromParent(); 2567 2568 return Induction; 2569 } 2570 2571 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2572 if (TripCount) 2573 return TripCount; 2574 2575 assert(L && "Create Trip Count for null loop."); 2576 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2577 // Find the loop boundaries. 2578 ScalarEvolution *SE = PSE.getSE(); 2579 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2580 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2581 "Invalid loop count"); 2582 2583 Type *IdxTy = Legal->getWidestInductionType(); 2584 assert(IdxTy && "No type for induction"); 2585 2586 // The exit count might have the type of i64 while the phi is i32. This can 2587 // happen if we have an induction variable that is sign extended before the 2588 // compare. The only way that we get a backedge taken count is that the 2589 // induction variable was signed and as such will not overflow. In such a case 2590 // truncation is legal. 
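  // Illustrative sketch: if the backedge-taken count was computed as (n - 1)
  // in i64 but the widest induction type is i32, it is truncated to i32 here,
  // and the expanded trip count below becomes (n - 1) + 1 = n.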
2591 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2592 IdxTy->getPrimitiveSizeInBits()) 2593 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2594 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2595 2596 // Get the total trip count from the count by adding 1. 2597 const SCEV *ExitCount = SE->getAddExpr( 2598 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2599 2600 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2601 2602 // Expand the trip count and place the new instructions in the preheader. 2603 // Notice that the pre-header does not change, only the loop body. 2604 SCEVExpander Exp(*SE, DL, "induction"); 2605 2606 // Count holds the overall loop count (N). 2607 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2608 L->getLoopPreheader()->getTerminator()); 2609 2610 if (TripCount->getType()->isPointerTy()) 2611 TripCount = 2612 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2613 L->getLoopPreheader()->getTerminator()); 2614 2615 return TripCount; 2616 } 2617 2618 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2619 if (VectorTripCount) 2620 return VectorTripCount; 2621 2622 Value *TC = getOrCreateTripCount(L); 2623 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2624 2625 Type *Ty = TC->getType(); 2626 Constant *Step = ConstantInt::get(Ty, VF * UF); 2627 2628 // If the tail is to be folded by masking, round the number of iterations N 2629 // up to a multiple of Step instead of rounding down. This is done by first 2630 // adding Step-1 and then rounding down. Note that it's ok if this addition 2631 // overflows: the vector induction variable will eventually wrap to zero given 2632 // that it starts at zero and its Step is a power of two; the loop will then 2633 // exit, with the last early-exit vector comparison also producing all-true. 2634 if (Cost->foldTailByMasking()) { 2635 assert(isPowerOf2_32(VF * UF) && 2636 "VF*UF must be a power of 2 when folding tail by masking"); 2637 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); 2638 } 2639 2640 // Now we need to generate the expression for the part of the loop that the 2641 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2642 // iterations are not required for correctness, or N - Step, otherwise. Step 2643 // is equal to the vectorization factor (number of SIMD elements) times the 2644 // unroll factor (number of SIMD instructions). 2645 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2646 2647 // If there is a non-reversed interleaved group that may speculatively access 2648 // memory out-of-bounds, we need to ensure that there will be at least one 2649 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2650 // the trip count, we set the remainder to be equal to the step. If the step 2651 // does not evenly divide the trip count, no adjustment is necessary since 2652 // there will already be scalar iterations. Note that the minimum iterations 2653 // check ensures that N >= Step. 
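  // Illustrative sketch (assuming VF * UF = 8 and no tail folding): for
  // N = 17, R = 17 urem 8 = 1 and n.vec = 16, leaving one scalar iteration.
  // If a scalar epilogue is required (the case handled below) and N = 16,
  // R would be 0 and is bumped to 8, so n.vec = 8 and the remaining 8
  // iterations run in the scalar epilogue.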
2654 if (VF > 1 && Cost->requiresScalarEpilogue()) { 2655 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2656 R = Builder.CreateSelect(IsZero, Step, R); 2657 } 2658 2659 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2660 2661 return VectorTripCount; 2662 } 2663 2664 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2665 const DataLayout &DL) { 2666 // Verify that V is a vector type with same number of elements as DstVTy. 2667 unsigned VF = DstVTy->getNumElements(); 2668 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2669 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2670 Type *SrcElemTy = SrcVecTy->getElementType(); 2671 Type *DstElemTy = DstVTy->getElementType(); 2672 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2673 "Vector elements must have same size"); 2674 2675 // Do a direct cast if element types are castable. 2676 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2677 return Builder.CreateBitOrPointerCast(V, DstVTy); 2678 } 2679 // V cannot be directly casted to desired vector type. 2680 // May happen when V is a floating point vector but DstVTy is a vector of 2681 // pointers or vice-versa. Handle this using a two-step bitcast using an 2682 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2683 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2684 "Only one type should be a pointer type"); 2685 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2686 "Only one type should be a floating point type"); 2687 Type *IntTy = 2688 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2689 VectorType *VecIntTy = VectorType::get(IntTy, VF); 2690 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2691 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2692 } 2693 2694 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2695 BasicBlock *Bypass) { 2696 Value *Count = getOrCreateTripCount(L); 2697 // Reuse existing vector loop preheader for TC checks. 2698 // Note that new preheader block is generated for vector loop. 2699 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2700 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2701 2702 // Generate code to check if the loop's trip count is less than VF * UF, or 2703 // equal to it in case a scalar epilogue is required; this implies that the 2704 // vector trip count is zero. This check also covers the case where adding one 2705 // to the backedge-taken count overflowed leading to an incorrect trip count 2706 // of zero. In this case we will also jump to the scalar loop. 2707 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 2708 : ICmpInst::ICMP_ULT; 2709 2710 // If tail is to be folded, vector loop takes care of all iterations. 2711 Value *CheckMinIters = Builder.getFalse(); 2712 if (!Cost->foldTailByMasking()) 2713 CheckMinIters = Builder.CreateICmp( 2714 P, Count, ConstantInt::get(Count->getType(), VF * UF), 2715 "min.iters.check"); 2716 2717 // Create new preheader for vector loop. 2718 LoopVectorPreHeader = 2719 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2720 "vector.ph"); 2721 2722 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2723 DT->getNode(Bypass)->getIDom()) && 2724 "TC check is expected to dominate Bypass"); 2725 2726 // Update dominator for Bypass & LoopExit. 
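  // Once the conditional branch below is installed, the CFG is roughly
  // (illustrative):
  //   TCCheckBlock: br i1 %min.iters.check, label %Bypass, label %vector.ph
  // so TCCheckBlock now dominates both the bypass (scalar) path and the new
  // vector preheader, which the dominator updates just below record.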
2727 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2728 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2729 2730 ReplaceInstWithInst( 2731 TCCheckBlock->getTerminator(), 2732 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2733 LoopBypassBlocks.push_back(TCCheckBlock); 2734 } 2735 2736 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2737 // Reuse existing vector loop preheader for SCEV checks. 2738 // Note that new preheader block is generated for vector loop. 2739 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 2740 2741 // Generate the code to check that the SCEV assumptions that we made. 2742 // We want the new basic block to start at the first instruction in a 2743 // sequence of instructions that form a check. 2744 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2745 "scev.check"); 2746 Value *SCEVCheck = Exp.expandCodeForPredicate( 2747 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 2748 2749 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2750 if (C->isZero()) 2751 return; 2752 2753 assert(!SCEVCheckBlock->getParent()->hasOptSize() && 2754 "Cannot SCEV check stride or overflow when optimizing for size"); 2755 2756 SCEVCheckBlock->setName("vector.scevcheck"); 2757 // Create new preheader for vector loop. 2758 LoopVectorPreHeader = 2759 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 2760 nullptr, "vector.ph"); 2761 2762 // Update dominator only if this is first RT check. 2763 if (LoopBypassBlocks.empty()) { 2764 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2765 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2766 } 2767 2768 ReplaceInstWithInst( 2769 SCEVCheckBlock->getTerminator(), 2770 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 2771 LoopBypassBlocks.push_back(SCEVCheckBlock); 2772 AddedSafetyChecks = true; 2773 } 2774 2775 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2776 // VPlan-native path does not do any analysis for runtime checks currently. 2777 if (EnableVPlanNativePath) 2778 return; 2779 2780 // Reuse existing vector loop preheader for runtime memory checks. 2781 // Note that new preheader block is generated for vector loop. 2782 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 2783 2784 // Generate the code that checks in runtime if arrays overlap. We put the 2785 // checks into a separate block to make the more common case of few elements 2786 // faster. 2787 Instruction *FirstCheckInst; 2788 Instruction *MemRuntimeCheck; 2789 std::tie(FirstCheckInst, MemRuntimeCheck) = 2790 Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator()); 2791 if (!MemRuntimeCheck) 2792 return; 2793 2794 if (MemCheckBlock->getParent()->hasOptSize()) { 2795 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2796 "Cannot emit memory checks when optimizing for size, unless forced " 2797 "to vectorize."); 2798 ORE->emit([&]() { 2799 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2800 L->getStartLoc(), L->getHeader()) 2801 << "Code-size may be reduced by not forcing " 2802 "vectorization, or by source-code modifications " 2803 "eliminating the need for runtime checks " 2804 "(e.g., adding 'restrict')."; 2805 }); 2806 } 2807 2808 MemCheckBlock->setName("vector.memcheck"); 2809 // Create new preheader for vector loop. 
2810 LoopVectorPreHeader = 2811 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2812 "vector.ph"); 2813 2814 // Update dominator only if this is first RT check. 2815 if (LoopBypassBlocks.empty()) { 2816 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2817 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2818 } 2819 2820 ReplaceInstWithInst( 2821 MemCheckBlock->getTerminator(), 2822 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); 2823 LoopBypassBlocks.push_back(MemCheckBlock); 2824 AddedSafetyChecks = true; 2825 2826 // We currently don't use LoopVersioning for the actual loop cloning but we 2827 // still use it to add the noalias metadata. 2828 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2829 PSE.getSE()); 2830 LVer->prepareNoAliasMetadata(); 2831 } 2832 2833 Value *InnerLoopVectorizer::emitTransformedIndex( 2834 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2835 const InductionDescriptor &ID) const { 2836 2837 SCEVExpander Exp(*SE, DL, "induction"); 2838 auto Step = ID.getStep(); 2839 auto StartValue = ID.getStartValue(); 2840 assert(Index->getType() == Step->getType() && 2841 "Index type does not match StepValue type"); 2842 2843 // Note: the IR at this point is broken. We cannot use SE to create any new 2844 // SCEV and then expand it, hoping that SCEV's simplification will give us 2845 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2846 // lead to various SCEV crashes. So all we can do is to use builder and rely 2847 // on InstCombine for future simplifications. Here we handle some trivial 2848 // cases only. 2849 auto CreateAdd = [&B](Value *X, Value *Y) { 2850 assert(X->getType() == Y->getType() && "Types don't match!"); 2851 if (auto *CX = dyn_cast<ConstantInt>(X)) 2852 if (CX->isZero()) 2853 return Y; 2854 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2855 if (CY->isZero()) 2856 return X; 2857 return B.CreateAdd(X, Y); 2858 }; 2859 2860 auto CreateMul = [&B](Value *X, Value *Y) { 2861 assert(X->getType() == Y->getType() && "Types don't match!"); 2862 if (auto *CX = dyn_cast<ConstantInt>(X)) 2863 if (CX->isOne()) 2864 return Y; 2865 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2866 if (CY->isOne()) 2867 return X; 2868 return B.CreateMul(X, Y); 2869 }; 2870 2871 switch (ID.getKind()) { 2872 case InductionDescriptor::IK_IntInduction: { 2873 assert(Index->getType() == StartValue->getType() && 2874 "Index type does not match StartValue type"); 2875 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 2876 return B.CreateSub(StartValue, Index); 2877 auto *Offset = CreateMul( 2878 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); 2879 return CreateAdd(StartValue, Offset); 2880 } 2881 case InductionDescriptor::IK_PtrInduction: { 2882 assert(isa<SCEVConstant>(Step) && 2883 "Expected constant step for pointer induction"); 2884 return B.CreateGEP( 2885 StartValue->getType()->getPointerElementType(), StartValue, 2886 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), 2887 &*B.GetInsertPoint()))); 2888 } 2889 case InductionDescriptor::IK_FpInduction: { 2890 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2891 auto InductionBinOp = ID.getInductionBinOp(); 2892 assert(InductionBinOp && 2893 (InductionBinOp->getOpcode() == Instruction::FAdd || 2894 InductionBinOp->getOpcode() == Instruction::FSub) && 2895 "Original bin op should be defined for FP induction"); 2896 2897 Value 
*StepValue = cast<SCEVUnknown>(Step)->getValue(); 2898 2899 // Floating point operations had to be 'fast' to enable the induction. 2900 FastMathFlags Flags; 2901 Flags.setFast(); 2902 2903 Value *MulExp = B.CreateFMul(StepValue, Index); 2904 if (isa<Instruction>(MulExp)) 2905 // We have to check, the MulExp may be a constant. 2906 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 2907 2908 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2909 "induction"); 2910 if (isa<Instruction>(BOp)) 2911 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2912 2913 return BOp; 2914 } 2915 case InductionDescriptor::IK_NoInduction: 2916 return nullptr; 2917 } 2918 llvm_unreachable("invalid enum"); 2919 } 2920 2921 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 2922 /* 2923 In this function we generate a new loop. The new loop will contain 2924 the vectorized instructions while the old loop will continue to run the 2925 scalar remainder. 2926 2927 [ ] <-- loop iteration number check. 2928 / | 2929 / v 2930 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2931 | / | 2932 | / v 2933 || [ ] <-- vector pre header. 2934 |/ | 2935 | v 2936 | [ ] \ 2937 | [ ]_| <-- vector loop. 2938 | | 2939 | v 2940 | -[ ] <--- middle-block. 2941 | / | 2942 | / v 2943 -|- >[ ] <--- new preheader. 2944 | | 2945 | v 2946 | [ ] \ 2947 | [ ]_| <-- old scalar loop to handle remainder. 2948 \ | 2949 \ v 2950 >[ ] <-- exit block. 2951 ... 2952 */ 2953 2954 MDNode *OrigLoopID = OrigLoop->getLoopID(); 2955 2956 // Some loops have a single integer induction variable, while other loops 2957 // don't. One example is c++ iterators that often have multiple pointer 2958 // induction variables. In the code below we also support a case where we 2959 // don't have a single induction variable. 2960 // 2961 // We try to obtain an induction variable from the original loop as hard 2962 // as possible. However if we don't find one that: 2963 // - is an integer 2964 // - counts from zero, stepping by one 2965 // - is the size of the widest induction variable type 2966 // then we create a new one. 2967 OldInduction = Legal->getPrimaryInduction(); 2968 Type *IdxTy = Legal->getWidestInductionType(); 2969 2970 // Split the single block loop into the two loop structure described above. 2971 LoopScalarBody = OrigLoop->getHeader(); 2972 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 2973 LoopExitBlock = OrigLoop->getExitBlock(); 2974 assert(LoopExitBlock && "Must have an exit block"); 2975 assert(LoopVectorPreHeader && "Invalid loop structure"); 2976 2977 LoopMiddleBlock = 2978 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2979 LI, nullptr, "middle.block"); 2980 LoopScalarPreHeader = 2981 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 2982 nullptr, "scalar.ph"); 2983 // We intentionally don't let SplitBlock to update LoopInfo since 2984 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 2985 // LoopVectorBody is explicitly added to the correct place few lines later. 2986 LoopVectorBody = 2987 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2988 nullptr, nullptr, "vector.body"); 2989 2990 // Update dominator for loop exit. 2991 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 2992 2993 // Create and register the new vector loop. 
2994 Loop *Lp = LI->AllocateLoop(); 2995 Loop *ParentLoop = OrigLoop->getParentLoop(); 2996 2997 // Insert the new loop into the loop nest and register the new basic blocks 2998 // before calling any utilities such as SCEV that require valid LoopInfo. 2999 if (ParentLoop) { 3000 ParentLoop->addChildLoop(Lp); 3001 } else { 3002 LI->addTopLevelLoop(Lp); 3003 } 3004 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3005 3006 // Find the loop boundaries. 3007 Value *Count = getOrCreateTripCount(Lp); 3008 3009 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3010 3011 // Now, compare the new count to zero. If it is zero skip the vector loop and 3012 // jump to the scalar loop. This check also covers the case where the 3013 // backedge-taken count is uint##_max: adding one to it will overflow leading 3014 // to an incorrect trip count of zero. In this (rare) case we will also jump 3015 // to the scalar loop. 3016 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3017 3018 // Generate the code to check any assumptions that we've made for SCEV 3019 // expressions. 3020 emitSCEVChecks(Lp, LoopScalarPreHeader); 3021 3022 // Generate the code that checks in runtime if arrays overlap. We put the 3023 // checks into a separate block to make the more common case of few elements 3024 // faster. 3025 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3026 3027 // Generate the induction variable. 3028 // The loop step is equal to the vectorization factor (num of SIMD elements) 3029 // times the unroll factor (num of SIMD instructions). 3030 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3031 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 3032 Induction = 3033 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3034 getDebugLocFromInstOrOperands(OldInduction)); 3035 3036 // We are going to resume the execution of the scalar loop. 3037 // Go over all of the induction variables that we found and fix the 3038 // PHIs that are left in the scalar version of the loop. 3039 // The starting values of PHI nodes depend on the counter of the last 3040 // iteration in the vectorized loop. 3041 // If we come from a bypass edge then we need to start from the original 3042 // start value. 3043 3044 // This variable saves the new starting index for the scalar loop. It is used 3045 // to test if there are any tail iterations left once the vector loop has 3046 // completed. 3047 for (auto &InductionEntry : Legal->getInductionVars()) { 3048 PHINode *OrigPhi = InductionEntry.first; 3049 InductionDescriptor II = InductionEntry.second; 3050 3051 // Create phi nodes to merge from the backedge-taken check block. 3052 PHINode *BCResumeVal = 3053 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3054 LoopScalarPreHeader->getTerminator()); 3055 // Copy original phi DL over to the new one. 3056 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3057 Value *&EndValue = IVEndValues[OrigPhi]; 3058 if (OrigPhi == OldInduction) { 3059 // We know what the end value is. 
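      // (The primary induction counts from zero in steps of one in the widest
      // induction type, so its value after the vector loop is exactly the
      // vector trip count; no index transformation is needed. Other inductions
      // are handled below.)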
3060 EndValue = CountRoundDown; 3061 } else { 3062 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); 3063 Type *StepType = II.getStep()->getType(); 3064 Instruction::CastOps CastOp = 3065 CastInst::getCastOpcode(CountRoundDown, true, StepType, true); 3066 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); 3067 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3068 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3069 EndValue->setName("ind.end"); 3070 } 3071 3072 // The new PHI merges the original incoming value, in case of a bypass, 3073 // or the value at the end of the vectorized loop. 3074 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3075 3076 // Fix the scalar body counter (PHI node). 3077 // The old induction's phi node in the scalar body needs the truncated 3078 // value. 3079 for (BasicBlock *BB : LoopBypassBlocks) 3080 BCResumeVal->addIncoming(II.getStartValue(), BB); 3081 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3082 } 3083 3084 // We need the OrigLoop (scalar loop part) latch terminator to help 3085 // produce correct debug info for the middle block BB instructions. 3086 // The legality check stage guarantees that the loop will have a single 3087 // latch. 3088 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3089 "Scalar loop latch terminator isn't a branch"); 3090 BranchInst *ScalarLatchBr = 3091 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3092 3093 // Add a check in the middle block to see if we have completed 3094 // all of the iterations in the first vector loop. 3095 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3096 // If tail is to be folded, we know we don't need to run the remainder. 3097 Value *CmpN = Builder.getTrue(); 3098 if (!Cost->foldTailByMasking()) { 3099 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3100 CountRoundDown, "cmp.n", 3101 LoopMiddleBlock->getTerminator()); 3102 3103 // Here we use the same DebugLoc as the scalar loop latch branch instead 3104 // of the corresponding compare because they may have ended up with 3105 // different line numbers and we want to avoid awkward line stepping while 3106 // debugging. Eg. if the compare has got a line number inside the loop. 3107 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3108 } 3109 3110 BranchInst *BrInst = 3111 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3112 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3113 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3114 3115 // Get ready to start creating new instructions into the vectorized body. 3116 assert(LoopVectorPreHeader == Lp->getLoopPreheader() && 3117 "Inconsistent vector loop preheader"); 3118 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3119 3120 Optional<MDNode *> VectorizedLoopID = 3121 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3122 LLVMLoopVectorizeFollowupVectorized}); 3123 if (VectorizedLoopID.hasValue()) { 3124 Lp->setLoopID(VectorizedLoopID.getValue()); 3125 3126 // Do not setAlreadyVectorized if loop attributes have been defined 3127 // explicitly. 3128 return LoopVectorPreHeader; 3129 } 3130 3131 // Keep all loop hints from the original loop on the vector loop (we'll 3132 // replace the vectorizer-specific hints below). 
3133 if (MDNode *LID = OrigLoop->getLoopID()) 3134 Lp->setLoopID(LID); 3135 3136 LoopVectorizeHints Hints(Lp, true, *ORE); 3137 Hints.setAlreadyVectorized(); 3138 3139 #ifdef EXPENSIVE_CHECKS 3140 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3141 LI->verify(*DT); 3142 #endif 3143 3144 return LoopVectorPreHeader; 3145 } 3146 3147 // Fix up external users of the induction variable. At this point, we are 3148 // in LCSSA form, with all external PHIs that use the IV having one input value, 3149 // coming from the remainder loop. We need those PHIs to also have a correct 3150 // value for the IV when arriving directly from the middle block. 3151 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3152 const InductionDescriptor &II, 3153 Value *CountRoundDown, Value *EndValue, 3154 BasicBlock *MiddleBlock) { 3155 // There are two kinds of external IV usages - those that use the value 3156 // computed in the last iteration (the PHI) and those that use the penultimate 3157 // value (the value that feeds into the phi from the loop latch). 3158 // We allow both, but they, obviously, have different values. 3159 3160 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3161 3162 DenseMap<Value *, Value *> MissingVals; 3163 3164 // An external user of the last iteration's value should see the value that 3165 // the remainder loop uses to initialize its own IV. 3166 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3167 for (User *U : PostInc->users()) { 3168 Instruction *UI = cast<Instruction>(U); 3169 if (!OrigLoop->contains(UI)) { 3170 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3171 MissingVals[UI] = EndValue; 3172 } 3173 } 3174 3175 // An external user of the penultimate value need to see EndValue - Step. 3176 // The simplest way to get this is to recompute it from the constituent SCEVs, 3177 // that is Start + (Step * (CRD - 1)). 3178 for (User *U : OrigPhi->users()) { 3179 auto *UI = cast<Instruction>(U); 3180 if (!OrigLoop->contains(UI)) { 3181 const DataLayout &DL = 3182 OrigLoop->getHeader()->getModule()->getDataLayout(); 3183 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3184 3185 IRBuilder<> B(MiddleBlock->getTerminator()); 3186 Value *CountMinusOne = B.CreateSub( 3187 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3188 Value *CMO = 3189 !II.getStep()->getType()->isIntegerTy() 3190 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3191 II.getStep()->getType()) 3192 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3193 CMO->setName("cast.cmo"); 3194 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3195 Escape->setName("ind.escape"); 3196 MissingVals[UI] = Escape; 3197 } 3198 } 3199 3200 for (auto &I : MissingVals) { 3201 PHINode *PHI = cast<PHINode>(I.first); 3202 // One corner case we have to handle is two IVs "chasing" each-other, 3203 // that is %IV2 = phi [...], [ %IV1, %latch ] 3204 // In this case, if IV1 has an external use, we need to avoid adding both 3205 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3206 // don't already have an incoming value for the middle block. 
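    // (getBasicBlockIndex returns -1 when MiddleBlock is not yet an incoming
    // block of the phi, so the check below adds an incoming value at most once
    // per exit phi.)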
3207 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3208 PHI->addIncoming(I.second, MiddleBlock); 3209 } 3210 } 3211 3212 namespace { 3213 3214 struct CSEDenseMapInfo { 3215 static bool canHandle(const Instruction *I) { 3216 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3217 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3218 } 3219 3220 static inline Instruction *getEmptyKey() { 3221 return DenseMapInfo<Instruction *>::getEmptyKey(); 3222 } 3223 3224 static inline Instruction *getTombstoneKey() { 3225 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3226 } 3227 3228 static unsigned getHashValue(const Instruction *I) { 3229 assert(canHandle(I) && "Unknown instruction!"); 3230 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3231 I->value_op_end())); 3232 } 3233 3234 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3235 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3236 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3237 return LHS == RHS; 3238 return LHS->isIdenticalTo(RHS); 3239 } 3240 }; 3241 3242 } // end anonymous namespace 3243 3244 ///Perform cse of induction variable instructions. 3245 static void cse(BasicBlock *BB) { 3246 // Perform simple cse. 3247 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3248 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3249 Instruction *In = &*I++; 3250 3251 if (!CSEDenseMapInfo::canHandle(In)) 3252 continue; 3253 3254 // Check if we can replace this instruction with any of the 3255 // visited instructions. 3256 if (Instruction *V = CSEMap.lookup(In)) { 3257 In->replaceAllUsesWith(V); 3258 In->eraseFromParent(); 3259 continue; 3260 } 3261 3262 CSEMap[In] = In; 3263 } 3264 } 3265 3266 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3267 unsigned VF, 3268 bool &NeedToScalarize) { 3269 Function *F = CI->getCalledFunction(); 3270 Type *ScalarRetTy = CI->getType(); 3271 SmallVector<Type *, 4> Tys, ScalarTys; 3272 for (auto &ArgOp : CI->arg_operands()) 3273 ScalarTys.push_back(ArgOp->getType()); 3274 3275 // Estimate cost of scalarized vector call. The source operands are assumed 3276 // to be vectors, so we need to extract individual elements from there, 3277 // execute VF scalar calls, and then gather the result into the vector return 3278 // value. 3279 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); 3280 if (VF == 1) 3281 return ScalarCallCost; 3282 3283 // Compute corresponding vector type for return value and arguments. 3284 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3285 for (Type *ScalarTy : ScalarTys) 3286 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3287 3288 // Compute costs of unpacking argument values for the scalar calls and 3289 // packing the return values to a vector. 3290 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3291 3292 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3293 3294 // If we can't emit a vector call for this function, then the currently found 3295 // cost is the cost we need to return. 3296 NeedToScalarize = true; 3297 VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/); 3298 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3299 3300 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3301 return Cost; 3302 3303 // If the corresponding vector cost is cheaper, return its cost. 
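  // For example (illustrative numbers only): with VF = 4, a scalar call cost
  // of 10 and a scalarization overhead of 8, Cost = 4 * 10 + 8 = 48. If the
  // target reports a cost of 20 for the vectorized variant below, the vector
  // call is cheaper and NeedToScalarize is cleared.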
3304 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); 3305 if (VectorCallCost < Cost) { 3306 NeedToScalarize = false; 3307 return VectorCallCost; 3308 } 3309 return Cost; 3310 } 3311 3312 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3313 unsigned VF) { 3314 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3315 assert(ID && "Expected intrinsic call!"); 3316 3317 FastMathFlags FMF; 3318 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3319 FMF = FPMO->getFastMathFlags(); 3320 3321 SmallVector<Value *, 4> Operands(CI->arg_operands()); 3322 return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF, CI); 3323 } 3324 3325 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3326 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3327 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3328 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3329 } 3330 3331 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3332 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3333 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3334 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3335 } 3336 3337 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3338 // For every instruction `I` in MinBWs, truncate the operands, create a 3339 // truncated version of `I` and reextend its result. InstCombine runs 3340 // later and will remove any ext/trunc pairs. 3341 SmallPtrSet<Value *, 4> Erased; 3342 for (const auto &KV : Cost->getMinimalBitwidths()) { 3343 // If the value wasn't vectorized, we must maintain the original scalar 3344 // type. The absence of the value from VectorLoopValueMap indicates that it 3345 // wasn't vectorized. 3346 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3347 continue; 3348 for (unsigned Part = 0; Part < UF; ++Part) { 3349 Value *I = getOrCreateVectorValue(KV.first, Part); 3350 if (Erased.find(I) != Erased.end() || I->use_empty() || 3351 !isa<Instruction>(I)) 3352 continue; 3353 Type *OriginalTy = I->getType(); 3354 Type *ScalarTruncatedTy = 3355 IntegerType::get(OriginalTy->getContext(), KV.second); 3356 Type *TruncatedTy = VectorType::get( 3357 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements()); 3358 if (TruncatedTy == OriginalTy) 3359 continue; 3360 3361 IRBuilder<> B(cast<Instruction>(I)); 3362 auto ShrinkOperand = [&](Value *V) -> Value * { 3363 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3364 if (ZI->getSrcTy() == TruncatedTy) 3365 return ZI->getOperand(0); 3366 return B.CreateZExtOrTrunc(V, TruncatedTy); 3367 }; 3368 3369 // The actual instruction modification depends on the instruction type, 3370 // unfortunately. 3371 Value *NewI = nullptr; 3372 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3373 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3374 ShrinkOperand(BO->getOperand(1))); 3375 3376 // Any wrapping introduced by shrinking this operation shouldn't be 3377 // considered undefined behavior. So, we can't unconditionally copy 3378 // arithmetic wrapping flags to NewI. 
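        // For example, an i32 'add nuw' of 255 and 1 cannot wrap, but the same
        // addition narrowed to i8 wraps to 0; keeping 'nuw' on the narrowed
        // instruction would turn its result into poison.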
3379 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3380 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3381 NewI = 3382 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3383 ShrinkOperand(CI->getOperand(1))); 3384 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3385 NewI = B.CreateSelect(SI->getCondition(), 3386 ShrinkOperand(SI->getTrueValue()), 3387 ShrinkOperand(SI->getFalseValue())); 3388 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3389 switch (CI->getOpcode()) { 3390 default: 3391 llvm_unreachable("Unhandled cast!"); 3392 case Instruction::Trunc: 3393 NewI = ShrinkOperand(CI->getOperand(0)); 3394 break; 3395 case Instruction::SExt: 3396 NewI = B.CreateSExtOrTrunc( 3397 CI->getOperand(0), 3398 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3399 break; 3400 case Instruction::ZExt: 3401 NewI = B.CreateZExtOrTrunc( 3402 CI->getOperand(0), 3403 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3404 break; 3405 } 3406 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3407 auto Elements0 = 3408 cast<VectorType>(SI->getOperand(0)->getType())->getNumElements(); 3409 auto *O0 = B.CreateZExtOrTrunc( 3410 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3411 auto Elements1 = 3412 cast<VectorType>(SI->getOperand(1)->getType())->getNumElements(); 3413 auto *O1 = B.CreateZExtOrTrunc( 3414 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3415 3416 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3417 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3418 // Don't do anything with the operands, just extend the result. 3419 continue; 3420 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3421 auto Elements = 3422 cast<VectorType>(IE->getOperand(0)->getType())->getNumElements(); 3423 auto *O0 = B.CreateZExtOrTrunc( 3424 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3425 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3426 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3427 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3428 auto Elements = 3429 cast<VectorType>(EE->getOperand(0)->getType())->getNumElements(); 3430 auto *O0 = B.CreateZExtOrTrunc( 3431 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3432 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3433 } else { 3434 // If we don't know what to do, be conservative and don't do anything. 3435 continue; 3436 } 3437 3438 // Lastly, extend the result. 3439 NewI->takeName(cast<Instruction>(I)); 3440 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3441 I->replaceAllUsesWith(Res); 3442 cast<Instruction>(I)->eraseFromParent(); 3443 Erased.insert(I); 3444 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3445 } 3446 } 3447 3448 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3449 for (const auto &KV : Cost->getMinimalBitwidths()) { 3450 // If the value wasn't vectorized, we must maintain the original scalar 3451 // type. The absence of the value from VectorLoopValueMap indicates that it 3452 // wasn't vectorized. 
3453 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3454 continue; 3455 for (unsigned Part = 0; Part < UF; ++Part) { 3456 Value *I = getOrCreateVectorValue(KV.first, Part); 3457 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3458 if (Inst && Inst->use_empty()) { 3459 Value *NewI = Inst->getOperand(0); 3460 Inst->eraseFromParent(); 3461 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3462 } 3463 } 3464 } 3465 } 3466 3467 void InnerLoopVectorizer::fixVectorizedLoop() { 3468 // Insert truncates and extends for any truncated instructions as hints to 3469 // InstCombine. 3470 if (VF > 1) 3471 truncateToMinimalBitwidths(); 3472 3473 // Fix widened non-induction PHIs by setting up the PHI operands. 3474 if (OrigPHIsToFix.size()) { 3475 assert(EnableVPlanNativePath && 3476 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3477 fixNonInductionPHIs(); 3478 } 3479 3480 // At this point every instruction in the original loop is widened to a 3481 // vector form. Now we need to fix the recurrences in the loop. These PHI 3482 // nodes are currently empty because we did not want to introduce cycles. 3483 // This is the second stage of vectorizing recurrences. 3484 fixCrossIterationPHIs(); 3485 3486 // Forget the original basic block. 3487 PSE.getSE()->forgetLoop(OrigLoop); 3488 3489 // Fix-up external users of the induction variables. 3490 for (auto &Entry : Legal->getInductionVars()) 3491 fixupIVUsers(Entry.first, Entry.second, 3492 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3493 IVEndValues[Entry.first], LoopMiddleBlock); 3494 3495 fixLCSSAPHIs(); 3496 for (Instruction *PI : PredicatedInstructions) 3497 sinkScalarOperands(&*PI); 3498 3499 // Remove redundant induction instructions. 3500 cse(LoopVectorBody); 3501 3502 // Set/update profile weights for the vector and remainder loops as original 3503 // loop iterations are now distributed among them. Note that original loop 3504 // represented by LoopScalarBody becomes remainder loop after vectorization. 3505 // 3506 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3507 // end up getting slightly roughened result but that should be OK since 3508 // profile is not inherently precise anyway. Note also possible bypass of 3509 // vector code caused by legality checks is ignored, assigning all the weight 3510 // to the vector loop, optimistically. 3511 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), 3512 LI->getLoopFor(LoopVectorBody), 3513 LI->getLoopFor(LoopScalarBody), VF * UF); 3514 } 3515 3516 void InnerLoopVectorizer::fixCrossIterationPHIs() { 3517 // In order to support recurrences we need to be able to vectorize Phi nodes. 3518 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3519 // stage #2: We now need to fix the recurrences by adding incoming edges to 3520 // the currently empty PHI nodes. At this point every instruction in the 3521 // original loop is widened to a vector form so we can use them to construct 3522 // the incoming edges. 3523 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 3524 // Handle first-order recurrences and reductions that need to be fixed. 3525 if (Legal->isFirstOrderRecurrence(&Phi)) 3526 fixFirstOrderRecurrence(&Phi); 3527 else if (Legal->isReductionVariable(&Phi)) 3528 fixReduction(&Phi); 3529 } 3530 } 3531 3532 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3533 // This is the second phase of vectorizing first-order recurrences. 
An 3534 // overview of the transformation is described below. Suppose we have the 3535 // following loop. 3536 // 3537 // for (int i = 0; i < n; ++i) 3538 // b[i] = a[i] - a[i - 1]; 3539 // 3540 // There is a first-order recurrence on "a". For this loop, the shorthand 3541 // scalar IR looks like: 3542 // 3543 // scalar.ph: 3544 // s_init = a[-1] 3545 // br scalar.body 3546 // 3547 // scalar.body: 3548 // i = phi [0, scalar.ph], [i+1, scalar.body] 3549 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3550 // s2 = a[i] 3551 // b[i] = s2 - s1 3552 // br cond, scalar.body, ... 3553 // 3554 // In this example, s1 is a recurrence because it's value depends on the 3555 // previous iteration. In the first phase of vectorization, we created a 3556 // temporary value for s1. We now complete the vectorization and produce the 3557 // shorthand vector IR shown below (for VF = 4, UF = 1). 3558 // 3559 // vector.ph: 3560 // v_init = vector(..., ..., ..., a[-1]) 3561 // br vector.body 3562 // 3563 // vector.body 3564 // i = phi [0, vector.ph], [i+4, vector.body] 3565 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3566 // v2 = a[i, i+1, i+2, i+3]; 3567 // v3 = vector(v1(3), v2(0, 1, 2)) 3568 // b[i, i+1, i+2, i+3] = v2 - v3 3569 // br cond, vector.body, middle.block 3570 // 3571 // middle.block: 3572 // x = v2(3) 3573 // br scalar.ph 3574 // 3575 // scalar.ph: 3576 // s_init = phi [x, middle.block], [a[-1], otherwise] 3577 // br scalar.body 3578 // 3579 // After execution completes the vector loop, we extract the next value of 3580 // the recurrence (x) to use as the initial value in the scalar loop. 3581 3582 // Get the original loop preheader and single loop latch. 3583 auto *Preheader = OrigLoop->getLoopPreheader(); 3584 auto *Latch = OrigLoop->getLoopLatch(); 3585 3586 // Get the initial and previous values of the scalar recurrence. 3587 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 3588 auto *Previous = Phi->getIncomingValueForBlock(Latch); 3589 3590 // Create a vector from the initial value. 3591 auto *VectorInit = ScalarInit; 3592 if (VF > 1) { 3593 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3594 VectorInit = Builder.CreateInsertElement( 3595 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3596 Builder.getInt32(VF - 1), "vector.recur.init"); 3597 } 3598 3599 // We constructed a temporary phi node in the first phase of vectorization. 3600 // This phi node will eventually be deleted. 3601 Builder.SetInsertPoint( 3602 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3603 3604 // Create a phi node for the new recurrence. The current value will either be 3605 // the initial value inserted into a vector or loop-varying vector value. 3606 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3607 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3608 3609 // Get the vectorized previous value of the last part UF - 1. It appears last 3610 // among all unrolled iterations, due to the order of their construction. 3611 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3612 3613 // Find and set the insertion point after the previous value if it is an 3614 // instruction. 3615 BasicBlock::iterator InsertPt; 3616 // Note that the previous value may have been constant-folded so it is not 3617 // guaranteed to be an instruction in the vector loop. 3618 // FIXME: Loop invariant values do not form recurrences. We should deal with 3619 // them earlier. 
3620 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3621 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3622 else { 3623 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3624 if (isa<PHINode>(PreviousLastPart)) 3625 // If the previous value is a phi node, we should insert after all the phi 3626 // nodes in the block containing the PHI to avoid breaking basic block 3627 // verification. Note that the basic block may be different to 3628 // LoopVectorBody, in case we predicate the loop. 3629 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3630 else 3631 InsertPt = ++PreviousInst->getIterator(); 3632 } 3633 Builder.SetInsertPoint(&*InsertPt); 3634 3635 // We will construct a vector for the recurrence by combining the values for 3636 // the current and previous iterations. This is the required shuffle mask. 3637 SmallVector<int, 8> ShuffleMask(VF); 3638 ShuffleMask[0] = VF - 1; 3639 for (unsigned I = 1; I < VF; ++I) 3640 ShuffleMask[I] = I + VF - 1; 3641 3642 // The vector from which to take the initial value for the current iteration 3643 // (actual or unrolled). Initially, this is the vector phi node. 3644 Value *Incoming = VecPhi; 3645 3646 // Shuffle the current and previous vector and update the vector parts. 3647 for (unsigned Part = 0; Part < UF; ++Part) { 3648 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3649 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3650 auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, 3651 ShuffleMask) 3652 : Incoming; 3653 PhiPart->replaceAllUsesWith(Shuffle); 3654 cast<Instruction>(PhiPart)->eraseFromParent(); 3655 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3656 Incoming = PreviousPart; 3657 } 3658 3659 // Fix the latch value of the new recurrence in the vector loop. 3660 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3661 3662 // Extract the last vector element in the middle block. This will be the 3663 // initial value for the recurrence when jumping to the scalar loop. 3664 auto *ExtractForScalar = Incoming; 3665 if (VF > 1) { 3666 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3667 ExtractForScalar = Builder.CreateExtractElement( 3668 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); 3669 } 3670 // Extract the second last element in the middle block if the 3671 // Phi is used outside the loop. We need to extract the phi itself 3672 // and not the last element (the phi update in the current iteration). This 3673 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3674 // when the scalar loop is not run at all. 3675 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3676 if (VF > 1) 3677 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3678 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); 3679 // When loop is unrolled without vectorizing, initialize 3680 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 3681 // `Incoming`. This is analogous to the vectorized case above: extracting the 3682 // second last element when VF > 1. 3683 else if (UF > 1) 3684 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 3685 3686 // Fix the initial value of the original recurrence in the scalar loop. 
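  // (In the shorthand example above, this is the phi in scalar.ph:
  //   s_init = phi [x, middle.block], [a[-1], otherwise].)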
3687   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3688   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3689   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3690     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3691     Start->addIncoming(Incoming, BB);
3692   }
3693 
3694   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3695   Phi->setName("scalar.recur");
3696 
3697   // Finally, fix users of the recurrence outside the loop. The users will need
3698   // either the last value of the scalar recurrence or the last value of the
3699   // vector recurrence we extracted in the middle block. Since the loop is in
3700   // LCSSA form, we just need to find all the phi nodes for the original scalar
3701   // recurrence in the exit block, and then add an edge for the middle block.
3702   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3703     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3704       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3705     }
3706   }
3707 }
3708 
3709 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3710   Constant *Zero = Builder.getInt32(0);
3711 
3712   // Get its reduction variable descriptor.
3713   assert(Legal->isReductionVariable(Phi) &&
3714          "Unable to find the reduction variable");
3715   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3716 
3717   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3718   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3719   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3720   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3721       RdxDesc.getMinMaxRecurrenceKind();
3722   setDebugLocFromInst(Builder, ReductionStartValue);
3723 
3724   // We need to generate a reduction vector from the incoming scalar.
3725   // To do so, we generate the 'identity' vector and override one of its
3726   // elements with the incoming scalar reduction value. This is done in the
3727   // vector-loop preheader.
3728   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3729 
3730   // This is the vector-clone of the value that leaves the loop.
3731   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3732 
3733   // Find the reduction identity variable. Zero for addition, or and xor;
3734   // one for multiplication; -1 for and.
3735   Value *Identity;
3736   Value *VectorStart;
3737   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3738       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3739     // MinMax reductions have the start value as their identity.
3740     if (VF == 1) {
3741       VectorStart = Identity = ReductionStartValue;
3742     } else {
3743       VectorStart = Identity =
3744           Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3745     }
3746   } else {
3747     // Handle other reduction kinds:
3748     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3749         RK, VecTy->getScalarType());
3750     if (VF == 1) {
3751       Identity = Iden;
3752       // This vector is the Identity vector where the first element is the
3753       // incoming scalar reduction.
3754       VectorStart = ReductionStartValue;
3755     } else {
3756       Identity = ConstantVector::getSplat({VF, false}, Iden);
3757 
3758       // This vector is the Identity vector where the first element is the
3759       // incoming scalar reduction.
3760       VectorStart =
3761           Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3762     }
3763   }
3764 
3765   // Wrap flags are in general invalid after vectorization, clear them.
3766   clearReductionWrapFlags(RdxDesc);
3767 
3768   // Fix the vector-loop phi.
3769 3770 // Reductions do not have to start at zero. They can start with 3771 // any loop invariant values. 3772 BasicBlock *Latch = OrigLoop->getLoopLatch(); 3773 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 3774 3775 for (unsigned Part = 0; Part < UF; ++Part) { 3776 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 3777 Value *Val = getOrCreateVectorValue(LoopVal, Part); 3778 // Make sure to add the reduction start value only to the 3779 // first unroll part. 3780 Value *StartVal = (Part == 0) ? VectorStart : Identity; 3781 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 3782 cast<PHINode>(VecRdxPhi) 3783 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3784 } 3785 3786 // Before each round, move the insertion point right between 3787 // the PHIs and the values we are going to write. 3788 // This allows us to write both PHINodes and the extractelement 3789 // instructions. 3790 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3791 3792 setDebugLocFromInst(Builder, LoopExitInst); 3793 3794 // If tail is folded by masking, the vector value to leave the loop should be 3795 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3796 // instead of the former. 3797 if (Cost->foldTailByMasking()) { 3798 for (unsigned Part = 0; Part < UF; ++Part) { 3799 Value *VecLoopExitInst = 3800 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3801 Value *Sel = nullptr; 3802 for (User *U : VecLoopExitInst->users()) { 3803 if (isa<SelectInst>(U)) { 3804 assert(!Sel && "Reduction exit feeding two selects"); 3805 Sel = U; 3806 } else 3807 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3808 } 3809 assert(Sel && "Reduction exit feeds no select"); 3810 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 3811 } 3812 } 3813 3814 // If the vector reduction can be performed in a smaller type, we truncate 3815 // then extend the loop exit value to enable InstCombine to evaluate the 3816 // entire expression in the smaller type. 3817 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { 3818 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3819 Builder.SetInsertPoint( 3820 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 3821 VectorParts RdxParts(UF); 3822 for (unsigned Part = 0; Part < UF; ++Part) { 3823 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3824 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3825 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3826 : Builder.CreateZExt(Trunc, VecTy); 3827 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 3828 UI != RdxParts[Part]->user_end();) 3829 if (*UI != Trunc) { 3830 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 3831 RdxParts[Part] = Extnd; 3832 } else { 3833 ++UI; 3834 } 3835 } 3836 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3837 for (unsigned Part = 0; Part < UF; ++Part) { 3838 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3839 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 3840 } 3841 } 3842 3843 // Reduce all of the unrolled parts into a single vector. 3844 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 3845 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3846 3847 // The middle block terminator has already been assigned a DebugLoc here (the 3848 // OrigLoop's single latch terminator). 
We want the whole middle block to 3849 // appear to execute on this line because: (a) it is all compiler generated, 3850 // (b) these instructions are always executed after evaluating the latch 3851 // conditional branch, and (c) other passes may add new predecessors which 3852 // terminate on this line. This is the easiest way to ensure we don't 3853 // accidentally cause an extra step back into the loop while debugging. 3854 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 3855 for (unsigned Part = 1; Part < UF; ++Part) { 3856 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3857 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3858 // Floating point operations had to be 'fast' to enable the reduction. 3859 ReducedPartRdx = addFastMathFlag( 3860 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 3861 ReducedPartRdx, "bin.rdx"), 3862 RdxDesc.getFastMathFlags()); 3863 else 3864 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 3865 RdxPart); 3866 } 3867 3868 if (VF > 1) { 3869 bool NoNaN = Legal->hasFunNoNaNAttr(); 3870 ReducedPartRdx = 3871 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 3872 // If the reduction can be performed in a smaller type, we need to extend 3873 // the reduction to the wider type before we branch to the original loop. 3874 if (Phi->getType() != RdxDesc.getRecurrenceType()) 3875 ReducedPartRdx = 3876 RdxDesc.isSigned() 3877 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 3878 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 3879 } 3880 3881 // Create a phi node that merges control-flow from the backedge-taken check 3882 // block and the middle block. 3883 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 3884 LoopScalarPreHeader->getTerminator()); 3885 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3886 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3887 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3888 3889 // Now, we need to fix the users of the reduction variable 3890 // inside and outside of the scalar remainder loop. 3891 // We know that the loop is in LCSSA form. We need to update the 3892 // PHI nodes in the exit blocks. 3893 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3894 // All PHINodes need to have a single entry edge, or two if 3895 // we already fixed them. 3896 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 3897 3898 // We found a reduction value exit-PHI. Update it with the 3899 // incoming bypass edge. 3900 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 3901 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 3902 } // end of the LCSSA phi scan. 3903 3904 // Fix the scalar loop reduction variable with the incoming reduction sum 3905 // from the vector body and from the backedge value. 3906 int IncomingEdgeBlockIdx = 3907 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 3908 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 3909 // Pick the other block. 3910 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 3911 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3912 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3913 } 3914 3915 void InnerLoopVectorizer::clearReductionWrapFlags( 3916 RecurrenceDescriptor &RdxDesc) { 3917 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3918 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 3919 RK != RecurrenceDescriptor::RK_IntegerMult) 3920 return; 3921 3922 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 3923 assert(LoopExitInstr && "null loop exit instruction"); 3924 SmallVector<Instruction *, 8> Worklist; 3925 SmallPtrSet<Instruction *, 8> Visited; 3926 Worklist.push_back(LoopExitInstr); 3927 Visited.insert(LoopExitInstr); 3928 3929 while (!Worklist.empty()) { 3930 Instruction *Cur = Worklist.pop_back_val(); 3931 if (isa<OverflowingBinaryOperator>(Cur)) 3932 for (unsigned Part = 0; Part < UF; ++Part) { 3933 Value *V = getOrCreateVectorValue(Cur, Part); 3934 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 3935 } 3936 3937 for (User *U : Cur->users()) { 3938 Instruction *UI = cast<Instruction>(U); 3939 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 3940 Visited.insert(UI).second) 3941 Worklist.push_back(UI); 3942 } 3943 } 3944 } 3945 3946 void InnerLoopVectorizer::fixLCSSAPHIs() { 3947 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3948 if (LCSSAPhi.getNumIncomingValues() == 1) { 3949 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 3950 // Non-instruction incoming values will have only one value. 3951 unsigned LastLane = 0; 3952 if (isa<Instruction>(IncomingValue)) 3953 LastLane = Cost->isUniformAfterVectorization( 3954 cast<Instruction>(IncomingValue), VF) 3955 ? 0 3956 : VF - 1; 3957 // Can be a loop invariant incoming value or the last scalar value to be 3958 // extracted from the vectorized loop. 3959 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3960 Value *lastIncomingValue = 3961 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 3962 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 3963 } 3964 } 3965 } 3966 3967 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 3968 // The basic block and loop containing the predicated instruction. 3969 auto *PredBB = PredInst->getParent(); 3970 auto *VectorLoop = LI->getLoopFor(PredBB); 3971 3972 // Initialize a worklist with the operands of the predicated instruction. 3973 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 3974 3975 // Holds instructions that we need to analyze again. An instruction may be 3976 // reanalyzed if we don't yet know if we can sink it or not. 3977 SmallVector<Instruction *, 8> InstsToReanalyze; 3978 3979 // Returns true if a given use occurs in the predicated block. Phi nodes use 3980 // their operands in their corresponding predecessor blocks. 3981 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 3982 auto *I = cast<Instruction>(U.getUser()); 3983 BasicBlock *BB = I->getParent(); 3984 if (auto *Phi = dyn_cast<PHINode>(I)) 3985 BB = Phi->getIncomingBlock( 3986 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 3987 return BB == PredBB; 3988 }; 3989 3990 // Iteratively sink the scalarized operands of the predicated instruction 3991 // into the block we created for it. When an instruction is sunk, it's 3992 // operands are then added to the worklist. The algorithm ends after one pass 3993 // through the worklist doesn't sink a single instruction. 
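  // For example (illustrative): a scalarized load in the predicated block whose
  // address is computed by an 'add' fed by a 'mul' is handled over two passes:
  // the 'add' is sunk first (its only use is the load inside the block), which
  // makes the 'mul' sinkable on the next pass.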
3994 bool Changed; 3995 do { 3996 // Add the instructions that need to be reanalyzed to the worklist, and 3997 // reset the changed indicator. 3998 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 3999 InstsToReanalyze.clear(); 4000 Changed = false; 4001 4002 while (!Worklist.empty()) { 4003 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4004 4005 // We can't sink an instruction if it is a phi node, is already in the 4006 // predicated block, is not in the loop, or may have side effects. 4007 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4008 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4009 continue; 4010 4011 // It's legal to sink the instruction if all its uses occur in the 4012 // predicated block. Otherwise, there's nothing to do yet, and we may 4013 // need to reanalyze the instruction. 4014 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4015 InstsToReanalyze.push_back(I); 4016 continue; 4017 } 4018 4019 // Move the instruction to the beginning of the predicated block, and add 4020 // it's operands to the worklist. 4021 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4022 Worklist.insert(I->op_begin(), I->op_end()); 4023 4024 // The sinking may have enabled other instructions to be sunk, so we will 4025 // need to iterate. 4026 Changed = true; 4027 } 4028 } while (Changed); 4029 } 4030 4031 void InnerLoopVectorizer::fixNonInductionPHIs() { 4032 for (PHINode *OrigPhi : OrigPHIsToFix) { 4033 PHINode *NewPhi = 4034 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4035 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4036 4037 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4038 predecessors(OrigPhi->getParent())); 4039 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4040 predecessors(NewPhi->getParent())); 4041 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4042 "Scalar and Vector BB should have the same number of predecessors"); 4043 4044 // The insertion point in Builder may be invalidated by the time we get 4045 // here. Force the Builder insertion point to something valid so that we do 4046 // not run into issues during insertion point restore in 4047 // getOrCreateVectorValue calls below. 4048 Builder.SetInsertPoint(NewPhi); 4049 4050 // The predecessor order is preserved and we can rely on mapping between 4051 // scalar and vector block predecessors. 4052 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4053 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4054 4055 // When looking up the new scalar/vector values to fix up, use incoming 4056 // values from original phi. 4057 Value *ScIncV = 4058 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4059 4060 // Scalar incoming value may need a broadcast 4061 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4062 NewPhi->addIncoming(NewIncV, NewPredBB); 4063 } 4064 } 4065 } 4066 4067 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, 4068 unsigned VF, bool IsPtrLoopInvariant, 4069 SmallBitVector &IsIndexLoopInvariant) { 4070 // Construct a vector GEP by widening the operands of the scalar GEP as 4071 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4072 // results in a vector of pointers when at least one operand of the GEP 4073 // is vector-typed. Thus, to keep the representation compact, we only use 4074 // vector-typed operands for loop-varying values. 
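  // For example (shorthand, illustrative): for "getelementptr %base, i64 %i"
  // with a loop-invariant %base and a widened induction %i, we emit
  // "getelementptr %base, <4 x i64> %vec.i" for VF = 4, which already produces
  // a vector of pointers without broadcasting %base.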
4075 4076 if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4077 // If we are vectorizing, but the GEP has only loop-invariant operands, 4078 // the GEP we build (by only using vector-typed operands for 4079 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4080 // produce a vector of pointers, we need to either arbitrarily pick an 4081 // operand to broadcast, or broadcast a clone of the original GEP. 4082 // Here, we broadcast a clone of the original. 4083 // 4084 // TODO: If at some point we decide to scalarize instructions having 4085 // loop-invariant operands, this special case will no longer be 4086 // required. We would add the scalarization decision to 4087 // collectLoopScalars() and teach getVectorValue() to broadcast 4088 // the lane-zero scalar value. 4089 auto *Clone = Builder.Insert(GEP->clone()); 4090 for (unsigned Part = 0; Part < UF; ++Part) { 4091 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4092 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4093 addMetadata(EntryPart, GEP); 4094 } 4095 } else { 4096 // If the GEP has at least one loop-varying operand, we are sure to 4097 // produce a vector of pointers. But if we are only unrolling, we want 4098 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4099 // produce with the code below will be scalar (if VF == 1) or vector 4100 // (otherwise). Note that for the unroll-only case, we still maintain 4101 // values in the vector mapping with initVector, as we do for other 4102 // instructions. 4103 for (unsigned Part = 0; Part < UF; ++Part) { 4104 // The pointer operand of the new GEP. If it's loop-invariant, we 4105 // won't broadcast it. 4106 auto *Ptr = IsPtrLoopInvariant 4107 ? GEP->getPointerOperand() 4108 : getOrCreateVectorValue(GEP->getPointerOperand(), Part); 4109 4110 // Collect all the indices for the new GEP. If any index is 4111 // loop-invariant, we won't broadcast it. 4112 SmallVector<Value *, 4> Indices; 4113 for (auto Index : enumerate(GEP->indices())) { 4114 Value *User = Index.value().get(); 4115 if (IsIndexLoopInvariant[Index.index()]) 4116 Indices.push_back(User); 4117 else 4118 Indices.push_back(getOrCreateVectorValue(User, Part)); 4119 } 4120 4121 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4122 // but it should be a vector, otherwise. 4123 auto *NewGEP = 4124 GEP->isInBounds() 4125 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4126 Indices) 4127 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4128 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4129 "NewGEP is not a pointer vector"); 4130 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4131 addMetadata(NewGEP, GEP); 4132 } 4133 } 4134 } 4135 4136 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4137 unsigned VF) { 4138 PHINode *P = cast<PHINode>(PN); 4139 if (EnableVPlanNativePath) { 4140 // Currently we enter here in the VPlan-native path for non-induction 4141 // PHIs where all control flow is uniform. We simply widen these PHIs. 4142 // Create a vector phi with no operands - the vector phi operands will be 4143 // set at the end of vector code generation. 4144 Type *VecTy = 4145 (VF == 1) ? 
PN->getType() : VectorType::get(PN->getType(), VF); 4146 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4147 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4148 OrigPHIsToFix.push_back(P); 4149 4150 return; 4151 } 4152 4153 assert(PN->getParent() == OrigLoop->getHeader() && 4154 "Non-header phis should have been handled elsewhere"); 4155 4156 // In order to support recurrences we need to be able to vectorize Phi nodes. 4157 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4158 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4159 // this value when we vectorize all of the instructions that use the PHI. 4160 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4161 for (unsigned Part = 0; Part < UF; ++Part) { 4162 // This is phase one of vectorizing PHIs. 4163 Type *VecTy = 4164 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4165 Value *EntryPart = PHINode::Create( 4166 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4167 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4168 } 4169 return; 4170 } 4171 4172 setDebugLocFromInst(Builder, P); 4173 4174 // This PHINode must be an induction variable. 4175 // Make sure that we know about it. 4176 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4177 4178 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4179 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4180 4181 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4182 // which can be found from the original scalar operations. 4183 switch (II.getKind()) { 4184 case InductionDescriptor::IK_NoInduction: 4185 llvm_unreachable("Unknown induction"); 4186 case InductionDescriptor::IK_IntInduction: 4187 case InductionDescriptor::IK_FpInduction: 4188 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4189 case InductionDescriptor::IK_PtrInduction: { 4190 // Handle the pointer induction variable case. 4191 assert(P->getType()->isPointerTy() && "Unexpected type."); 4192 // This is the normalized GEP that starts counting at zero. 4193 Value *PtrInd = Induction; 4194 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); 4195 // Determine the number of scalars we need to generate for each unroll 4196 // iteration. If the instruction is uniform, we only need to generate the 4197 // first lane. Otherwise, we generate all VF values. 4198 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; 4199 // These are the scalar results. Notice that we don't generate vector GEPs 4200 // because scalar GEPs result in better code. 4201 for (unsigned Part = 0; Part < UF; ++Part) { 4202 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4203 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); 4204 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4205 Value *SclrGep = 4206 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4207 SclrGep->setName("next.gep"); 4208 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4209 } 4210 } 4211 return; 4212 } 4213 } 4214 } 4215 4216 /// A helper function for checking whether an integer division-related 4217 /// instruction may divide by zero (in which case it must be predicated if 4218 /// executed conditionally in the scalar code). 4219 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 
4220 /// Non-zero divisors that are non compile-time constants will not be 4221 /// converted into multiplication, so we will still end up scalarizing 4222 /// the division, but can do so w/o predication. 4223 static bool mayDivideByZero(Instruction &I) { 4224 assert((I.getOpcode() == Instruction::UDiv || 4225 I.getOpcode() == Instruction::SDiv || 4226 I.getOpcode() == Instruction::URem || 4227 I.getOpcode() == Instruction::SRem) && 4228 "Unexpected instruction"); 4229 Value *Divisor = I.getOperand(1); 4230 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4231 return !CInt || CInt->isZero(); 4232 } 4233 4234 void InnerLoopVectorizer::widenInstruction(Instruction &I) { 4235 switch (I.getOpcode()) { 4236 case Instruction::Call: 4237 case Instruction::Br: 4238 case Instruction::PHI: 4239 case Instruction::GetElementPtr: 4240 case Instruction::Select: 4241 llvm_unreachable("This instruction is handled by a different recipe."); 4242 case Instruction::UDiv: 4243 case Instruction::SDiv: 4244 case Instruction::SRem: 4245 case Instruction::URem: 4246 case Instruction::Add: 4247 case Instruction::FAdd: 4248 case Instruction::Sub: 4249 case Instruction::FSub: 4250 case Instruction::FNeg: 4251 case Instruction::Mul: 4252 case Instruction::FMul: 4253 case Instruction::FDiv: 4254 case Instruction::FRem: 4255 case Instruction::Shl: 4256 case Instruction::LShr: 4257 case Instruction::AShr: 4258 case Instruction::And: 4259 case Instruction::Or: 4260 case Instruction::Xor: { 4261 // Just widen unops and binops. 4262 setDebugLocFromInst(Builder, &I); 4263 4264 for (unsigned Part = 0; Part < UF; ++Part) { 4265 SmallVector<Value *, 2> Ops; 4266 for (Value *Op : I.operands()) 4267 Ops.push_back(getOrCreateVectorValue(Op, Part)); 4268 4269 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4270 4271 if (auto *VecOp = dyn_cast<Instruction>(V)) 4272 VecOp->copyIRFlags(&I); 4273 4274 // Use this vector value for all users of the original instruction. 4275 VectorLoopValueMap.setVectorValue(&I, Part, V); 4276 addMetadata(V, &I); 4277 } 4278 4279 break; 4280 } 4281 case Instruction::ICmp: 4282 case Instruction::FCmp: { 4283 // Widen compares. Generate vector compares. 4284 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4285 auto *Cmp = cast<CmpInst>(&I); 4286 setDebugLocFromInst(Builder, Cmp); 4287 for (unsigned Part = 0; Part < UF; ++Part) { 4288 Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); 4289 Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part); 4290 Value *C = nullptr; 4291 if (FCmp) { 4292 // Propagate fast math flags. 4293 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4294 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4295 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4296 } else { 4297 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4298 } 4299 VectorLoopValueMap.setVectorValue(&I, Part, C); 4300 addMetadata(C, &I); 4301 } 4302 4303 break; 4304 } 4305 4306 case Instruction::ZExt: 4307 case Instruction::SExt: 4308 case Instruction::FPToUI: 4309 case Instruction::FPToSI: 4310 case Instruction::FPExt: 4311 case Instruction::PtrToInt: 4312 case Instruction::IntToPtr: 4313 case Instruction::SIToFP: 4314 case Instruction::UIToFP: 4315 case Instruction::Trunc: 4316 case Instruction::FPTrunc: 4317 case Instruction::BitCast: { 4318 auto *CI = cast<CastInst>(&I); 4319 setDebugLocFromInst(Builder, CI); 4320 4321 /// Vectorize casts. 4322 Type *DestTy = 4323 (VF == 1) ? 
CI->getType() : VectorType::get(CI->getType(), VF); 4324 4325 for (unsigned Part = 0; Part < UF; ++Part) { 4326 Value *A = getOrCreateVectorValue(CI->getOperand(0), Part); 4327 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4328 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4329 addMetadata(Cast, &I); 4330 } 4331 break; 4332 } 4333 default: 4334 // This instruction is not vectorized by simple widening. 4335 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4336 llvm_unreachable("Unhandled instruction!"); 4337 } // end of switch. 4338 } 4339 4340 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, 4341 VPTransformState &State) { 4342 assert(!isa<DbgInfoIntrinsic>(I) && 4343 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4344 setDebugLocFromInst(Builder, &I); 4345 4346 Module *M = I.getParent()->getParent()->getParent(); 4347 auto *CI = cast<CallInst>(&I); 4348 4349 SmallVector<Type *, 4> Tys; 4350 for (Value *ArgOperand : CI->arg_operands()) 4351 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4352 4353 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4354 4355 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4356 // version of the instruction. 4357 // Is it beneficial to perform intrinsic call compared to lib call? 4358 bool NeedToScalarize = false; 4359 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4360 bool UseVectorIntrinsic = 4361 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4362 assert((UseVectorIntrinsic || !NeedToScalarize) && 4363 "Instruction should be scalarized elsewhere."); 4364 4365 for (unsigned Part = 0; Part < UF; ++Part) { 4366 SmallVector<Value *, 4> Args; 4367 for (auto &I : enumerate(ArgOperands.operands())) { 4368 // Some intrinsics have a scalar argument - don't replace it with a 4369 // vector. 4370 Value *Arg; 4371 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4372 Arg = State.get(I.value(), Part); 4373 else 4374 Arg = State.get(I.value(), {0, 0}); 4375 Args.push_back(Arg); 4376 } 4377 4378 Function *VectorF; 4379 if (UseVectorIntrinsic) { 4380 // Use vector version of the intrinsic. 4381 Type *TysForDecl[] = {CI->getType()}; 4382 if (VF > 1) 4383 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4384 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4385 } else { 4386 // Use vector version of the function call. 
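      // (Illustrative) The lookup below asks the VFDatabase for a declared
      // vector variant that matches this call's shape, e.g. a 4-lane library
      // routine for a scalar math call when VF == 4; which variants exist
      // depends on the TLI vector-library mappings for the target.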
4387 const VFShape Shape = 4388 VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/); 4389 #ifndef NDEBUG 4390 const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI); 4391 assert(std::find_if(Infos.begin(), Infos.end(), 4392 [&Shape](const VFInfo &Info) { 4393 return Info.Shape == Shape; 4394 }) != Infos.end() && 4395 "Vector function shape is missing from the database."); 4396 #endif 4397 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4398 } 4399 assert(VectorF && "Can't create vector function."); 4400 4401 SmallVector<OperandBundleDef, 1> OpBundles; 4402 CI->getOperandBundlesAsDefs(OpBundles); 4403 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4404 4405 if (isa<FPMathOperator>(V)) 4406 V->copyFastMathFlags(CI); 4407 4408 VectorLoopValueMap.setVectorValue(&I, Part, V); 4409 addMetadata(V, &I); 4410 } 4411 } 4412 4413 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, 4414 bool InvariantCond) { 4415 setDebugLocFromInst(Builder, &I); 4416 4417 // The condition can be loop invariant but still defined inside the 4418 // loop. This means that we can't just use the original 'cond' value. 4419 // We have to take the 'vectorized' value and pick the first lane. 4420 // Instcombine will make this a no-op. 4421 4422 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); 4423 4424 for (unsigned Part = 0; Part < UF; ++Part) { 4425 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); 4426 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); 4427 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); 4428 Value *Sel = 4429 Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1); 4430 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4431 addMetadata(Sel, &I); 4432 } 4433 } 4434 4435 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4436 // We should not collect Scalars more than once per VF. Right now, this 4437 // function is called from collectUniformsAndScalars(), which already does 4438 // this check. Collecting Scalars for VF=1 does not make any sense. 4439 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && 4440 "This function should not be visited twice for the same VF"); 4441 4442 SmallSetVector<Instruction *, 8> Worklist; 4443 4444 // These sets are used to seed the analysis with pointers used by memory 4445 // accesses that will remain scalar. 4446 SmallSetVector<Instruction *, 8> ScalarPtrs; 4447 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4448 4449 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4450 // The pointer operands of loads and stores will be scalar as long as the 4451 // memory access is not a gather or scatter operation. The value operand of a 4452 // store will remain scalar if the store is scalarized. 4453 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4454 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4455 assert(WideningDecision != CM_Unknown && 4456 "Widening decision should be ready at this moment"); 4457 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4458 if (Ptr == Store->getValueOperand()) 4459 return WideningDecision == CM_Scalarize; 4460 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4461 "Ptr is neither a value or pointer operand"); 4462 return WideningDecision != CM_GatherScatter; 4463 }; 4464 4465 // A helper that returns true if the given value is a bitcast or 4466 // getelementptr instruction contained in the loop. 
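// (Illustrative) For instance, 'getelementptr %A, %iv' defined inside the loop
// qualifies, while a getelementptr of a global with only constant indices is
// loop-invariant and does not.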
4467 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4468 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4469 isa<GetElementPtrInst>(V)) && 4470 !TheLoop->isLoopInvariant(V); 4471 }; 4472 4473 // A helper that evaluates a memory access's use of a pointer. If the use 4474 // will be a scalar use, and the pointer is only used by memory accesses, we 4475 // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4476 // PossibleNonScalarPtrs. 4477 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4478 // We only care about bitcast and getelementptr instructions contained in 4479 // the loop. 4480 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4481 return; 4482 4483 // If the pointer has already been identified as scalar (e.g., if it was 4484 // also identified as uniform), there's nothing to do. 4485 auto *I = cast<Instruction>(Ptr); 4486 if (Worklist.count(I)) 4487 return; 4488 4489 // If the use of the pointer will be a scalar use, and all users of the 4490 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4491 // place the pointer in PossibleNonScalarPtrs. 4492 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4493 return isa<LoadInst>(U) || isa<StoreInst>(U); 4494 })) 4495 ScalarPtrs.insert(I); 4496 else 4497 PossibleNonScalarPtrs.insert(I); 4498 }; 4499 4500 // We seed the scalars analysis with three classes of instructions: (1) 4501 // instructions marked uniform-after-vectorization, (2) bitcast and 4502 // getelementptr instructions used by memory accesses requiring a scalar use, 4503 // and (3) pointer induction variables and their update instructions (we 4504 // currently only scalarize these). 4505 // 4506 // (1) Add to the worklist all instructions that have been identified as 4507 // uniform-after-vectorization. 4508 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4509 4510 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4511 // memory accesses requiring a scalar use. The pointer operands of loads and 4512 // stores will be scalar as long as the memory accesses is not a gather or 4513 // scatter operation. The value operand of a store will remain scalar if the 4514 // store is scalarized. 4515 for (auto *BB : TheLoop->blocks()) 4516 for (auto &I : *BB) { 4517 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4518 evaluatePtrUse(Load, Load->getPointerOperand()); 4519 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4520 evaluatePtrUse(Store, Store->getPointerOperand()); 4521 evaluatePtrUse(Store, Store->getValueOperand()); 4522 } 4523 } 4524 for (auto *I : ScalarPtrs) 4525 if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) { 4526 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4527 Worklist.insert(I); 4528 } 4529 4530 // (3) Add to the worklist all pointer induction variables and their update 4531 // instructions. 4532 // 4533 // TODO: Once we are able to vectorize pointer induction variables we should 4534 // no longer insert them into the worklist here. 
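// (Illustrative) For a pointer induction such as 'p = phi [%start], [%p.next]'
// with 'p.next = getelementptr p, 1', both p and p.next are seeded here, since
// pointer IVs are currently emitted as per-lane scalar GEPs rather than as a
// widened vector value.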
4535 auto *Latch = TheLoop->getLoopLatch(); 4536 for (auto &Induction : Legal->getInductionVars()) { 4537 auto *Ind = Induction.first; 4538 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4539 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) 4540 continue; 4541 Worklist.insert(Ind); 4542 Worklist.insert(IndUpdate); 4543 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4544 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4545 << "\n"); 4546 } 4547 4548 // Insert the forced scalars. 4549 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4550 // induction variable when the PHI user is scalarized. 4551 auto ForcedScalar = ForcedScalars.find(VF); 4552 if (ForcedScalar != ForcedScalars.end()) 4553 for (auto *I : ForcedScalar->second) 4554 Worklist.insert(I); 4555 4556 // Expand the worklist by looking through any bitcasts and getelementptr 4557 // instructions we've already identified as scalar. This is similar to the 4558 // expansion step in collectLoopUniforms(); however, here we're only 4559 // expanding to include additional bitcasts and getelementptr instructions. 4560 unsigned Idx = 0; 4561 while (Idx != Worklist.size()) { 4562 Instruction *Dst = Worklist[Idx++]; 4563 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4564 continue; 4565 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4566 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4567 auto *J = cast<Instruction>(U); 4568 return !TheLoop->contains(J) || Worklist.count(J) || 4569 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4570 isScalarUse(J, Src)); 4571 })) { 4572 Worklist.insert(Src); 4573 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4574 } 4575 } 4576 4577 // An induction variable will remain scalar if all users of the induction 4578 // variable and induction variable update remain scalar. 4579 for (auto &Induction : Legal->getInductionVars()) { 4580 auto *Ind = Induction.first; 4581 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4582 4583 // We already considered pointer induction variables, so there's no reason 4584 // to look at their users again. 4585 // 4586 // TODO: Once we are able to vectorize pointer induction variables we 4587 // should no longer skip over them here. 4588 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) 4589 continue; 4590 4591 // Determine if all users of the induction variable are scalar after 4592 // vectorization. 4593 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4594 auto *I = cast<Instruction>(U); 4595 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4596 }); 4597 if (!ScalarInd) 4598 continue; 4599 4600 // Determine if all users of the induction variable update instruction are 4601 // scalar after vectorization. 4602 auto ScalarIndUpdate = 4603 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4604 auto *I = cast<Instruction>(U); 4605 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4606 }); 4607 if (!ScalarIndUpdate) 4608 continue; 4609 4610 // The induction variable and its update instruction will remain scalar. 
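// (Illustrative) E.g. an induction whose only in-loop users are the latch
// compare and address computations already known to be scalar keeps a scalar
// IV; if any user is widened instead, a vector IV is materialized for it.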
4611 Worklist.insert(Ind); 4612 Worklist.insert(IndUpdate); 4613 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4614 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4615 << "\n"); 4616 } 4617 4618 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4619 } 4620 4621 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { 4622 if (!blockNeedsPredication(I->getParent())) 4623 return false; 4624 switch(I->getOpcode()) { 4625 default: 4626 break; 4627 case Instruction::Load: 4628 case Instruction::Store: { 4629 if (!Legal->isMaskRequired(I)) 4630 return false; 4631 auto *Ptr = getLoadStorePointerOperand(I); 4632 auto *Ty = getMemInstValueType(I); 4633 // We have already decided how to vectorize this instruction, get that 4634 // result. 4635 if (VF > 1) { 4636 InstWidening WideningDecision = getWideningDecision(I, VF); 4637 assert(WideningDecision != CM_Unknown && 4638 "Widening decision should be ready at this moment"); 4639 return WideningDecision == CM_Scalarize; 4640 } 4641 const MaybeAlign Alignment = getLoadStoreAlignment(I); 4642 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4643 isLegalMaskedGather(Ty, Alignment)) 4644 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4645 isLegalMaskedScatter(Ty, Alignment)); 4646 } 4647 case Instruction::UDiv: 4648 case Instruction::SDiv: 4649 case Instruction::SRem: 4650 case Instruction::URem: 4651 return mayDivideByZero(*I); 4652 } 4653 return false; 4654 } 4655 4656 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, 4657 unsigned VF) { 4658 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4659 assert(getWideningDecision(I, VF) == CM_Unknown && 4660 "Decision should not be set yet."); 4661 auto *Group = getInterleavedAccessGroup(I); 4662 assert(Group && "Must have a group."); 4663 4664 // If the instruction's allocated size doesn't equal it's type size, it 4665 // requires padding and will be scalarized. 4666 auto &DL = I->getModule()->getDataLayout(); 4667 auto *ScalarTy = getMemInstValueType(I); 4668 if (hasIrregularType(ScalarTy, DL, VF)) 4669 return false; 4670 4671 // Check if masking is required. 4672 // A Group may need masking for one of two reasons: it resides in a block that 4673 // needs predication, or it was decided to use masking to deal with gaps. 4674 bool PredicatedAccessRequiresMasking = 4675 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 4676 bool AccessWithGapsRequiresMasking = 4677 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 4678 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 4679 return true; 4680 4681 // If masked interleaving is required, we expect that the user/target had 4682 // enabled it, because otherwise it either wouldn't have been created or 4683 // it should have been invalidated by the CostModel. 4684 assert(useMaskedInterleavedAccesses(TTI) && 4685 "Masked interleave-groups for predicated accesses are not enabled."); 4686 4687 auto *Ty = getMemInstValueType(I); 4688 const MaybeAlign Alignment = getLoadStoreAlignment(I); 4689 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4690 : TTI.isLegalMaskedStore(Ty, Alignment); 4691 } 4692 4693 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, 4694 unsigned VF) { 4695 // Get and ensure we have a valid memory instruction. 
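// (Illustrative) E.g. a unit-stride access like 'A[i]' can be widened into a
// single wide load/store, while an indexed access like 'A[B[i]]' is not
// consecutive and is handled as a gather/scatter or scalarized instead.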
4696 LoadInst *LI = dyn_cast<LoadInst>(I); 4697 StoreInst *SI = dyn_cast<StoreInst>(I); 4698 assert((LI || SI) && "Invalid memory instruction"); 4699 4700 auto *Ptr = getLoadStorePointerOperand(I); 4701 4702 // In order to be widened, the pointer should be consecutive, first of all. 4703 if (!Legal->isConsecutivePtr(Ptr)) 4704 return false; 4705 4706 // If the instruction is a store located in a predicated block, it will be 4707 // scalarized. 4708 if (isScalarWithPredication(I)) 4709 return false; 4710 4711 // If the instruction's allocated size doesn't equal it's type size, it 4712 // requires padding and will be scalarized. 4713 auto &DL = I->getModule()->getDataLayout(); 4714 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 4715 if (hasIrregularType(ScalarTy, DL, VF)) 4716 return false; 4717 4718 return true; 4719 } 4720 4721 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { 4722 // We should not collect Uniforms more than once per VF. Right now, 4723 // this function is called from collectUniformsAndScalars(), which 4724 // already does this check. Collecting Uniforms for VF=1 does not make any 4725 // sense. 4726 4727 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && 4728 "This function should not be visited twice for the same VF"); 4729 4730 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4731 // not analyze again. Uniforms.count(VF) will return 1. 4732 Uniforms[VF].clear(); 4733 4734 // We now know that the loop is vectorizable! 4735 // Collect instructions inside the loop that will remain uniform after 4736 // vectorization. 4737 4738 // Global values, params and instructions outside of current loop are out of 4739 // scope. 4740 auto isOutOfScope = [&](Value *V) -> bool { 4741 Instruction *I = dyn_cast<Instruction>(V); 4742 return (!I || !TheLoop->contains(I)); 4743 }; 4744 4745 SetVector<Instruction *> Worklist; 4746 BasicBlock *Latch = TheLoop->getLoopLatch(); 4747 4748 // Instructions that are scalar with predication must not be considered 4749 // uniform after vectorization, because that would create an erroneous 4750 // replicating region where only a single instance out of VF should be formed. 4751 // TODO: optimize such seldom cases if found important, see PR40816. 4752 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4753 if (isScalarWithPredication(I, VF)) { 4754 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4755 << *I << "\n"); 4756 return; 4757 } 4758 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4759 Worklist.insert(I); 4760 }; 4761 4762 // Start with the conditional branch. If the branch condition is an 4763 // instruction contained in the loop that is only used by the branch, it is 4764 // uniform. 4765 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4766 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4767 addToWorklistIfAllowed(Cmp); 4768 4769 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 4770 // are pointers that are treated like consecutive pointers during 4771 // vectorization. The pointer operands of interleaved accesses are an 4772 // example. 4773 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 4774 4775 // Holds pointer operands of instructions that are possibly non-uniform. 
4776 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 4777 4778 auto isUniformDecision = [&](Instruction *I, unsigned VF) { 4779 InstWidening WideningDecision = getWideningDecision(I, VF); 4780 assert(WideningDecision != CM_Unknown && 4781 "Widening decision should be ready at this moment"); 4782 4783 return (WideningDecision == CM_Widen || 4784 WideningDecision == CM_Widen_Reverse || 4785 WideningDecision == CM_Interleave); 4786 }; 4787 // Iterate over the instructions in the loop, and collect all 4788 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 4789 // that a consecutive-like pointer operand will be scalarized, we collect it 4790 // in PossibleNonUniformPtrs instead. We use two sets here because a single 4791 // getelementptr instruction can be used by both vectorized and scalarized 4792 // memory instructions. For example, if a loop loads and stores from the same 4793 // location, but the store is conditional, the store will be scalarized, and 4794 // the getelementptr won't remain uniform. 4795 for (auto *BB : TheLoop->blocks()) 4796 for (auto &I : *BB) { 4797 // If there's no pointer operand, there's nothing to do. 4798 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 4799 if (!Ptr) 4800 continue; 4801 4802 // True if all users of Ptr are memory accesses that have Ptr as their 4803 // pointer operand. 4804 auto UsersAreMemAccesses = 4805 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 4806 return getLoadStorePointerOperand(U) == Ptr; 4807 }); 4808 4809 // Ensure the memory instruction will not be scalarized or used by 4810 // gather/scatter, making its pointer operand non-uniform. If the pointer 4811 // operand is used by any instruction other than a memory access, we 4812 // conservatively assume the pointer operand may be non-uniform. 4813 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 4814 PossibleNonUniformPtrs.insert(Ptr); 4815 4816 // If the memory instruction will be vectorized and its pointer operand 4817 // is consecutive-like, or interleaving - the pointer operand should 4818 // remain uniform. 4819 else 4820 ConsecutiveLikePtrs.insert(Ptr); 4821 } 4822 4823 // Add to the Worklist all consecutive and consecutive-like pointers that 4824 // aren't also identified as possibly non-uniform. 4825 for (auto *V : ConsecutiveLikePtrs) 4826 if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) 4827 addToWorklistIfAllowed(V); 4828 4829 // Expand Worklist in topological order: whenever a new instruction 4830 // is added , its users should be already inside Worklist. It ensures 4831 // a uniform instruction will only be used by uniform instructions. 4832 unsigned idx = 0; 4833 while (idx != Worklist.size()) { 4834 Instruction *I = Worklist[idx++]; 4835 4836 for (auto OV : I->operand_values()) { 4837 // isOutOfScope operands cannot be uniform instructions. 4838 if (isOutOfScope(OV)) 4839 continue; 4840 // First order recurrence Phi's should typically be considered 4841 // non-uniform. 4842 auto *OP = dyn_cast<PHINode>(OV); 4843 if (OP && Legal->isFirstOrderRecurrence(OP)) 4844 continue; 4845 // If all the users of the operand are uniform, then add the 4846 // operand into the uniform worklist. 
4847 auto *OI = cast<Instruction>(OV); 4848 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4849 auto *J = cast<Instruction>(U); 4850 return Worklist.count(J) || 4851 (OI == getLoadStorePointerOperand(J) && 4852 isUniformDecision(J, VF)); 4853 })) 4854 addToWorklistIfAllowed(OI); 4855 } 4856 } 4857 4858 // Returns true if Ptr is the pointer operand of a memory access instruction 4859 // I, and I is known to not require scalarization. 4860 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4861 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4862 }; 4863 4864 // For an instruction to be added into Worklist above, all its users inside 4865 // the loop should also be in Worklist. However, this condition cannot be 4866 // true for phi nodes that form a cyclic dependence. We must process phi 4867 // nodes separately. An induction variable will remain uniform if all users 4868 // of the induction variable and induction variable update remain uniform. 4869 // The code below handles both pointer and non-pointer induction variables. 4870 for (auto &Induction : Legal->getInductionVars()) { 4871 auto *Ind = Induction.first; 4872 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4873 4874 // Determine if all users of the induction variable are uniform after 4875 // vectorization. 4876 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4877 auto *I = cast<Instruction>(U); 4878 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4879 isVectorizedMemAccessUse(I, Ind); 4880 }); 4881 if (!UniformInd) 4882 continue; 4883 4884 // Determine if all users of the induction variable update instruction are 4885 // uniform after vectorization. 4886 auto UniformIndUpdate = 4887 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4888 auto *I = cast<Instruction>(U); 4889 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4890 isVectorizedMemAccessUse(I, IndUpdate); 4891 }); 4892 if (!UniformIndUpdate) 4893 continue; 4894 4895 // The induction variable and its update instruction will remain uniform. 4896 addToWorklistIfAllowed(Ind); 4897 addToWorklistIfAllowed(IndUpdate); 4898 } 4899 4900 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4901 } 4902 4903 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4904 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4905 4906 if (Legal->getRuntimePointerChecking()->Need) { 4907 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4908 "runtime pointer checks needed. Enable vectorization of this " 4909 "loop with '#pragma clang loop vectorize(enable)' when " 4910 "compiling with -Os/-Oz", 4911 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4912 return true; 4913 } 4914 4915 if (!PSE.getUnionPredicate().getPredicates().empty()) { 4916 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4917 "runtime SCEV checks needed. Enable vectorization of this " 4918 "loop with '#pragma clang loop vectorize(enable)' when " 4919 "compiling with -Os/-Oz", 4920 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4921 return true; 4922 } 4923 4924 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4925 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4926 reportVectorizationFailure("Runtime stride check is required with -Os/-Oz", 4927 "runtime stride == 1 checks needed. 
Enable vectorization of " 4928 "this loop with '#pragma clang loop vectorize(enable)' when " 4929 "compiling with -Os/-Oz", 4930 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4931 return true; 4932 } 4933 4934 return false; 4935 } 4936 4937 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { 4938 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4939 // TODO: It may be useful to do this anyway, since the check is still likely 4940 // to be dynamically uniform if the target can skip it. 4941 reportVectorizationFailure( 4942 "Not inserting runtime ptr check for divergent target", 4943 "runtime pointer checks needed. Not enabled for divergent target", 4944 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 4945 return None; 4946 } 4947 4948 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4949 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4950 if (TC == 1) { 4951 reportVectorizationFailure("Single iteration (non) loop", 4952 "loop trip count is one, irrelevant for vectorization", 4953 "SingleIterationLoop", ORE, TheLoop); 4954 return None; 4955 } 4956 4957 switch (ScalarEpilogueStatus) { 4958 case CM_ScalarEpilogueAllowed: 4959 return computeFeasibleMaxVF(TC); 4960 case CM_ScalarEpilogueNotNeededUsePredicate: 4961 LLVM_DEBUG( 4962 dbgs() << "LV: vector predicate hint/switch found.\n" 4963 << "LV: Not allowing scalar epilogue, creating predicated " 4964 << "vector loop.\n"); 4965 break; 4966 case CM_ScalarEpilogueNotAllowedLowTripLoop: 4967 // fallthrough as a special case of OptForSize 4968 case CM_ScalarEpilogueNotAllowedOptSize: 4969 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 4970 LLVM_DEBUG( 4971 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 4972 else 4973 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4974 << "count.\n"); 4975 4976 // Bail if runtime checks are required, which are not good when optimising 4977 // for size. 4978 if (runtimeChecksRequired()) 4979 return None; 4980 break; 4981 } 4982 4983 // Now try to fold the tail by masking. 4984 4985 // Invalidate interleave groups that require an epilogue if we can't mask 4986 // the interleave-group. 4987 if (!useMaskedInterleavedAccesses(TTI)) { 4988 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 4989 "No decisions should have been taken at this point"); 4990 // Note: There is no need to invalidate any cost modeling decisions here, as 4991 // none were taken so far. 4992 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 4993 } 4994 4995 unsigned MaxVF = computeFeasibleMaxVF(TC); 4996 if (TC > 0 && TC % MaxVF == 0) { 4997 // Accept MaxVF if we do not have a tail. 4998 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 4999 return MaxVF; 5000 } 5001 5002 // If we don't know the precise trip count, or if the trip count that we 5003 // found modulo the vectorization factor is not zero, try to fold the tail 5004 // by masking. 5005 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
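// (Illustrative) E.g. with TC == 17 and MaxVF == 8 the remainder is 1, so
// folding the tail by masking lets the vector loop run ceil(17 / 8) == 3
// iterations, with the inactive lanes of the final iteration masked off,
// instead of keeping a scalar epilogue.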
5006 if (Legal->prepareToFoldTailByMasking()) { 5007 FoldTailByMasking = true; 5008 return MaxVF; 5009 } 5010 5011 if (TC == 0) { 5012 reportVectorizationFailure( 5013 "Unable to calculate the loop count due to complex control flow", 5014 "unable to calculate the loop count due to complex control flow", 5015 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5016 return None; 5017 } 5018 5019 reportVectorizationFailure( 5020 "Cannot optimize for size and vectorize at the same time.", 5021 "cannot optimize for size and vectorize at the same time. " 5022 "Enable vectorization of this loop with '#pragma clang loop " 5023 "vectorize(enable)' when compiling with -Os/-Oz", 5024 "NoTailLoopWithOptForSize", ORE, TheLoop); 5025 return None; 5026 } 5027 5028 unsigned 5029 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 5030 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5031 unsigned SmallestType, WidestType; 5032 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5033 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5034 5035 // Get the maximum safe dependence distance in bits computed by LAA. 5036 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5037 // the memory accesses that is most restrictive (involved in the smallest 5038 // dependence distance). 5039 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 5040 5041 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 5042 5043 unsigned MaxVectorSize = WidestRegister / WidestType; 5044 5045 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5046 << " / " << WidestType << " bits.\n"); 5047 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5048 << WidestRegister << " bits.\n"); 5049 5050 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 5051 " into one vector!"); 5052 if (MaxVectorSize == 0) { 5053 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5054 MaxVectorSize = 1; 5055 return MaxVectorSize; 5056 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5057 isPowerOf2_32(ConstTripCount)) { 5058 // We need to clamp the VF to be the ConstTripCount. There is no point in 5059 // choosing a higher viable VF as done in the loop below. 5060 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5061 << ConstTripCount << "\n"); 5062 MaxVectorSize = ConstTripCount; 5063 return MaxVectorSize; 5064 } 5065 5066 unsigned MaxVF = MaxVectorSize; 5067 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5068 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5069 // Collect all viable vectorization factors larger than the default MaxVF 5070 // (i.e. MaxVectorSize). 5071 SmallVector<unsigned, 8> VFs; 5072 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5073 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5074 VFs.push_back(VS); 5075 5076 // For each VF calculate its register usage. 5077 auto RUs = calculateRegisterUsage(VFs); 5078 5079 // Select the largest VF which doesn't require more registers than existing 5080 // ones. 
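// (Illustrative) E.g. with a 256-bit widest register, a widest type of 64 bits
// and a smallest type of 8 bits, the default MaxVF is 4 and the candidate VFs
// 8, 16 and 32 are also costed; the largest candidate whose estimated register
// usage fits each register class is chosen.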
5081 for (int i = RUs.size() - 1; i >= 0; --i) { 5082 bool Selected = true; 5083 for (auto& pair : RUs[i].MaxLocalUsers) { 5084 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5085 if (pair.second > TargetNumRegisters) 5086 Selected = false; 5087 } 5088 if (Selected) { 5089 MaxVF = VFs[i]; 5090 break; 5091 } 5092 } 5093 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5094 if (MaxVF < MinVF) { 5095 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5096 << ") with target's minimum: " << MinVF << '\n'); 5097 MaxVF = MinVF; 5098 } 5099 } 5100 } 5101 return MaxVF; 5102 } 5103 5104 VectorizationFactor 5105 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5106 float Cost = expectedCost(1).first; 5107 const float ScalarCost = Cost; 5108 unsigned Width = 1; 5109 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5110 5111 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5112 if (ForceVectorization && MaxVF > 1) { 5113 // Ignore scalar width, because the user explicitly wants vectorization. 5114 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5115 // evaluation. 5116 Cost = std::numeric_limits<float>::max(); 5117 } 5118 5119 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5120 // Notice that the vector loop needs to be executed less times, so 5121 // we need to divide the cost of the vector loops by the width of 5122 // the vector elements. 5123 VectorizationCostTy C = expectedCost(i); 5124 float VectorCost = C.first / (float)i; 5125 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5126 << " costs: " << (int)VectorCost << ".\n"); 5127 if (!C.second && !ForceVectorization) { 5128 LLVM_DEBUG( 5129 dbgs() << "LV: Not considering vector loop of width " << i 5130 << " because it will not generate any vector instructions.\n"); 5131 continue; 5132 } 5133 if (VectorCost < Cost) { 5134 Cost = VectorCost; 5135 Width = i; 5136 } 5137 } 5138 5139 if (!EnableCondStoresVectorization && NumPredStores) { 5140 reportVectorizationFailure("There are conditional stores.", 5141 "store that is conditionally executed prevents vectorization", 5142 "ConditionalStore", ORE, TheLoop); 5143 Width = 1; 5144 Cost = ScalarCost; 5145 } 5146 5147 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5148 << "LV: Vectorization seems to be not beneficial, " 5149 << "but was forced by a user.\n"); 5150 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5151 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; 5152 return Factor; 5153 } 5154 5155 std::pair<unsigned, unsigned> 5156 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5157 unsigned MinWidth = -1U; 5158 unsigned MaxWidth = 8; 5159 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5160 5161 // For each block. 5162 for (BasicBlock *BB : TheLoop->blocks()) { 5163 // For each instruction in the loop. 5164 for (Instruction &I : BB->instructionsWithoutDebug()) { 5165 Type *T = I.getType(); 5166 5167 // Skip ignored values. 5168 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end()) 5169 continue; 5170 5171 // Only examine Loads, Stores and PHINodes. 5172 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5173 continue; 5174 5175 // Examine PHI nodes that are reduction variables. Update the type to 5176 // account for the recurrence type. 
5177 if (auto *PN = dyn_cast<PHINode>(&I)) { 5178 if (!Legal->isReductionVariable(PN)) 5179 continue; 5180 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5181 T = RdxDesc.getRecurrenceType(); 5182 } 5183 5184 // Examine the stored values. 5185 if (auto *ST = dyn_cast<StoreInst>(&I)) 5186 T = ST->getValueOperand()->getType(); 5187 5188 // Ignore loaded pointer types and stored pointer types that are not 5189 // vectorizable. 5190 // 5191 // FIXME: The check here attempts to predict whether a load or store will 5192 // be vectorized. We only know this for certain after a VF has 5193 // been selected. Here, we assume that if an access can be 5194 // vectorized, it will be. We should also look at extending this 5195 // optimization to non-pointer types. 5196 // 5197 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5198 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5199 continue; 5200 5201 MinWidth = std::min(MinWidth, 5202 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5203 MaxWidth = std::max(MaxWidth, 5204 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5205 } 5206 } 5207 5208 return {MinWidth, MaxWidth}; 5209 } 5210 5211 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, 5212 unsigned LoopCost) { 5213 // -- The interleave heuristics -- 5214 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5215 // There are many micro-architectural considerations that we can't predict 5216 // at this level. For example, frontend pressure (on decode or fetch) due to 5217 // code size, or the number and capabilities of the execution ports. 5218 // 5219 // We use the following heuristics to select the interleave count: 5220 // 1. If the code has reductions, then we interleave to break the cross 5221 // iteration dependency. 5222 // 2. If the loop is really small, then we interleave to reduce the loop 5223 // overhead. 5224 // 3. We don't interleave if we think that we will spill registers to memory 5225 // due to the increased register pressure. 5226 5227 if (!isScalarEpilogueAllowed()) 5228 return 1; 5229 5230 // We used the distance for the interleave count. 5231 if (Legal->getMaxSafeDepDistBytes() != -1U) 5232 return 1; 5233 5234 // Do not interleave loops with a relatively small known or estimated trip 5235 // count. 5236 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5237 if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) 5238 return 1; 5239 5240 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5241 // We divide by these constants so assume that we have at least one 5242 // instruction that uses at least one register. 5243 for (auto& pair : R.MaxLocalUsers) { 5244 pair.second = std::max(pair.second, 1U); 5245 } 5246 5247 // We calculate the interleave count using the following formula. 5248 // Subtract the number of loop invariants from the number of available 5249 // registers. These registers are used by all of the interleaved instances. 5250 // Next, divide the remaining registers by the number of registers that is 5251 // required by the loop, in order to estimate how many parallel instances 5252 // fit without causing spills. All of this is rounded down if necessary to be 5253 // a power of two. We want power of two interleave count to simplify any 5254 // addressing operations or alignment considerations. 
5255 // We also want power of two interleave counts to ensure that the induction 5256 // variable of the vector loop wraps to zero, when tail is folded by masking; 5257 // this currently happens when OptForSize, in which case IC is set to 1 above. 5258 unsigned IC = UINT_MAX; 5259 5260 for (auto& pair : R.MaxLocalUsers) { 5261 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5262 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5263 << " registers of " 5264 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5265 if (VF == 1) { 5266 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5267 TargetNumRegisters = ForceTargetNumScalarRegs; 5268 } else { 5269 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5270 TargetNumRegisters = ForceTargetNumVectorRegs; 5271 } 5272 unsigned MaxLocalUsers = pair.second; 5273 unsigned LoopInvariantRegs = 0; 5274 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5275 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5276 5277 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5278 // Don't count the induction variable as interleaved. 5279 if (EnableIndVarRegisterHeur) { 5280 TmpIC = 5281 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5282 std::max(1U, (MaxLocalUsers - 1))); 5283 } 5284 5285 IC = std::min(IC, TmpIC); 5286 } 5287 5288 // Clamp the interleave ranges to reasonable counts. 5289 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5290 5291 // Check if the user has overridden the max. 5292 if (VF == 1) { 5293 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5294 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5295 } else { 5296 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5297 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5298 } 5299 5300 // If trip count is known or estimated compile time constant, limit the 5301 // interleave count to be less than the trip count divided by VF. 5302 if (BestKnownTC) { 5303 MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); 5304 } 5305 5306 // If we did not calculate the cost for VF (because the user selected the VF) 5307 // then we calculate the cost of VF here. 5308 if (LoopCost == 0) 5309 LoopCost = expectedCost(VF).first; 5310 5311 assert(LoopCost && "Non-zero loop cost expected"); 5312 5313 // Clamp the calculated IC to be between the 1 and the max interleave count 5314 // that the target and trip count allows. 5315 if (IC > MaxInterleaveCount) 5316 IC = MaxInterleaveCount; 5317 else if (IC < 1) 5318 IC = 1; 5319 5320 // Interleave if we vectorized this loop and there is a reduction that could 5321 // benefit from interleaving. 5322 if (VF > 1 && !Legal->getReductionVars().empty()) { 5323 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5324 return IC; 5325 } 5326 5327 // Note that if we've already vectorized the loop we will have done the 5328 // runtime check and so interleaving won't require further checks. 5329 bool InterleavingRequiresRuntimePointerCheck = 5330 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5331 5332 // We want to interleave small loops in order to reduce the loop overhead and 5333 // potentially expose ILP opportunities. 
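// (Illustrative) Assuming the default small-loop threshold of 20, a loop with
// an estimated cost of 6 would get SmallIC = PowerOf2Floor(20 / 6) == 2 below,
// still clamped by IC and by the store/load port heuristic.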
5334 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5335 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5336 // We assume that the cost overhead is 1 and we use the cost model 5337 // to estimate the cost of the loop and interleave until the cost of the 5338 // loop overhead is about 5% of the cost of the loop. 5339 unsigned SmallIC = 5340 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5341 5342 // Interleave until store/load ports (estimated by max interleave count) are 5343 // saturated. 5344 unsigned NumStores = Legal->getNumStores(); 5345 unsigned NumLoads = Legal->getNumLoads(); 5346 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5347 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5348 5349 // If we have a scalar reduction (vector reductions are already dealt with 5350 // by this point), we can increase the critical path length if the loop 5351 // we're interleaving is inside another loop. Limit it, by default, to 2, so the 5352 // critical path only gets increased by one reduction operation. 5353 if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) { 5354 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5355 SmallIC = std::min(SmallIC, F); 5356 StoresIC = std::min(StoresIC, F); 5357 LoadsIC = std::min(LoadsIC, F); 5358 } 5359 5360 if (EnableLoadStoreRuntimeInterleave && 5361 std::max(StoresIC, LoadsIC) > SmallIC) { 5362 LLVM_DEBUG( 5363 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5364 return std::max(StoresIC, LoadsIC); 5365 } 5366 5367 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5368 return SmallIC; 5369 } 5370 5371 // Interleave if this is a large loop (small loops are already dealt with by 5372 // this point) that could benefit from interleaving. 5373 bool HasReductions = !Legal->getReductionVars().empty(); 5374 if (TTI.enableAggressiveInterleaving(HasReductions)) { 5375 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5376 return IC; 5377 } 5378 5379 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5380 return 1; 5381 } 5382 5383 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5384 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { 5385 // This function calculates the register usage by measuring the highest number 5386 // of values that are alive at a single location. Obviously, this is a very 5387 // rough estimation. We scan the loop in topological order and 5388 // assign a number to each instruction. We use RPO to ensure that defs are 5389 // met before their users. We assume that each instruction that has in-loop 5390 // users starts an interval. We record every time that an in-loop value is 5391 // used, so we have a list of the first and last occurrences of each 5392 // instruction. Next, we transpose this data structure into a multi-map that 5393 // holds the list of intervals that *end* at a specific location. This 5394 // multi-map allows us to perform a linear search. We scan the instructions linearly 5395 // and record each time that a new interval starts, by placing it in a set. 5396 // If we find this value in the multi-map then we remove it from the set. 5397 // The max register usage is the maximum size of the set. 5398 // We also search for instructions that are defined outside the loop, but are 5399 // used inside the loop.
We need this number separately from the max-interval 5400 // usage number because when we unroll, loop-invariant values do not take 5401 // more register. 5402 LoopBlocksDFS DFS(TheLoop); 5403 DFS.perform(LI); 5404 5405 RegisterUsage RU; 5406 5407 // Each 'key' in the map opens a new interval. The values 5408 // of the map are the index of the 'last seen' usage of the 5409 // instruction that is the key. 5410 using IntervalMap = DenseMap<Instruction *, unsigned>; 5411 5412 // Maps instruction to its index. 5413 SmallVector<Instruction *, 64> IdxToInstr; 5414 // Marks the end of each interval. 5415 IntervalMap EndPoint; 5416 // Saves the list of instruction indices that are used in the loop. 5417 SmallPtrSet<Instruction *, 8> Ends; 5418 // Saves the list of values that are used in the loop but are 5419 // defined outside the loop, such as arguments and constants. 5420 SmallPtrSet<Value *, 8> LoopInvariants; 5421 5422 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5423 for (Instruction &I : BB->instructionsWithoutDebug()) { 5424 IdxToInstr.push_back(&I); 5425 5426 // Save the end location of each USE. 5427 for (Value *U : I.operands()) { 5428 auto *Instr = dyn_cast<Instruction>(U); 5429 5430 // Ignore non-instruction values such as arguments, constants, etc. 5431 if (!Instr) 5432 continue; 5433 5434 // If this instruction is outside the loop then record it and continue. 5435 if (!TheLoop->contains(Instr)) { 5436 LoopInvariants.insert(Instr); 5437 continue; 5438 } 5439 5440 // Overwrite previous end points. 5441 EndPoint[Instr] = IdxToInstr.size(); 5442 Ends.insert(Instr); 5443 } 5444 } 5445 } 5446 5447 // Saves the list of intervals that end with the index in 'key'. 5448 using InstrList = SmallVector<Instruction *, 2>; 5449 DenseMap<unsigned, InstrList> TransposeEnds; 5450 5451 // Transpose the EndPoints to a list of values that end at each index. 5452 for (auto &Interval : EndPoint) 5453 TransposeEnds[Interval.second].push_back(Interval.first); 5454 5455 SmallPtrSet<Instruction *, 8> OpenIntervals; 5456 5457 // Get the size of the widest register. 5458 unsigned MaxSafeDepDist = -1U; 5459 if (Legal->getMaxSafeDepDistBytes() != -1U) 5460 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5461 unsigned WidestRegister = 5462 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5463 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5464 5465 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5466 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5467 5468 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5469 5470 // A lambda that gets the register usage for the given type and VF. 5471 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5472 if (Ty->isTokenTy()) 5473 return 0U; 5474 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5475 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5476 }; 5477 5478 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5479 Instruction *I = IdxToInstr[i]; 5480 5481 // Remove all of the instructions that end at this location. 5482 InstrList &List = TransposeEnds[i]; 5483 for (Instruction *ToRemove : List) 5484 OpenIntervals.erase(ToRemove); 5485 5486 // Ignore instructions that are never used within the loop. 5487 if (Ends.find(I) == Ends.end()) 5488 continue; 5489 5490 // Skip ignored values. 5491 if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) 5492 continue; 5493 5494 // For each VF find the maximum usage of registers. 
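// (Illustrative) E.g. at VF == 4, a live i32 value that will be widened counts
// against the vector register class (one register, or several if <4 x i32>
// exceeds the widest register), while a value that stays scalar after
// vectorization counts against the scalar class.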
5495 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5496 // Count the number of live intervals. 5497 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5498 5499 if (VFs[j] == 1) { 5500 for (auto Inst : OpenIntervals) { 5501 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5502 if (RegUsage.find(ClassID) == RegUsage.end()) 5503 RegUsage[ClassID] = 1; 5504 else 5505 RegUsage[ClassID] += 1; 5506 } 5507 } else { 5508 collectUniformsAndScalars(VFs[j]); 5509 for (auto Inst : OpenIntervals) { 5510 // Skip ignored values for VF > 1. 5511 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) 5512 continue; 5513 if (isScalarAfterVectorization(Inst, VFs[j])) { 5514 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5515 if (RegUsage.find(ClassID) == RegUsage.end()) 5516 RegUsage[ClassID] = 1; 5517 else 5518 RegUsage[ClassID] += 1; 5519 } else { 5520 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5521 if (RegUsage.find(ClassID) == RegUsage.end()) 5522 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5523 else 5524 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5525 } 5526 } 5527 } 5528 5529 for (auto& pair : RegUsage) { 5530 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5531 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5532 else 5533 MaxUsages[j][pair.first] = pair.second; 5534 } 5535 } 5536 5537 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5538 << OpenIntervals.size() << '\n'); 5539 5540 // Add the current instruction to the list of open intervals. 5541 OpenIntervals.insert(I); 5542 } 5543 5544 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5545 SmallMapVector<unsigned, unsigned, 4> Invariant; 5546 5547 for (auto Inst : LoopInvariants) { 5548 unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5549 unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); 5550 if (Invariant.find(ClassID) == Invariant.end()) 5551 Invariant[ClassID] = Usage; 5552 else 5553 Invariant[ClassID] += Usage; 5554 } 5555 5556 LLVM_DEBUG({ 5557 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5558 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5559 << " item\n"; 5560 for (const auto &pair : MaxUsages[i]) { 5561 dbgs() << "LV(REG): RegisterClass: " 5562 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5563 << " registers\n"; 5564 } 5565 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5566 << " item\n"; 5567 for (const auto &pair : Invariant) { 5568 dbgs() << "LV(REG): RegisterClass: " 5569 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5570 << " registers\n"; 5571 } 5572 }); 5573 5574 RU.LoopInvariantRegs = Invariant; 5575 RU.MaxLocalUsers = MaxUsages[i]; 5576 RUs[i] = RU; 5577 } 5578 5579 return RUs; 5580 } 5581 5582 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5583 // TODO: Cost model for emulated masked load/store is completely 5584 // broken. This hack guides the cost model to use an artificially 5585 // high enough value to practically disable vectorization with such 5586 // operations, except where previously deployed legality hack allowed 5587 // using very low cost values. This is to avoid regressions coming simply 5588 // from moving "masked load/store" check from legality to cost model. 5589 // Masked Load/Gather emulation was previously never allowed. 5590 // Limited number of Masked Store/Scatter emulation was allowed. 
5591 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 5592 return isa<LoadInst>(I) || 5593 (isa<StoreInst>(I) && 5594 NumPredStores > NumberOfStoresToPredicate); 5595 } 5596 5597 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { 5598 // If we aren't vectorizing the loop, or if we've already collected the 5599 // instructions to scalarize, there's nothing to do. Collection may already 5600 // have occurred if we have a user-selected VF and are now computing the 5601 // expected cost for interleaving. 5602 if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end()) 5603 return; 5604 5605 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 5606 // not profitable to scalarize any instructions, the presence of VF in the 5607 // map will indicate that we've analyzed it already. 5608 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 5609 5610 // Find all the instructions that are scalar with predication in the loop and 5611 // determine if it would be better to not if-convert the blocks they are in. 5612 // If so, we also record the instructions to scalarize. 5613 for (BasicBlock *BB : TheLoop->blocks()) { 5614 if (!blockNeedsPredication(BB)) 5615 continue; 5616 for (Instruction &I : *BB) 5617 if (isScalarWithPredication(&I)) { 5618 ScalarCostsTy ScalarCosts; 5619 // Do not apply discount logic if hacked cost is needed 5620 // for emulated masked memrefs. 5621 if (!useEmulatedMaskMemRefHack(&I) && 5622 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 5623 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5624 // Remember that BB will remain after vectorization. 5625 PredicatedBBsAfterVectorization.insert(BB); 5626 } 5627 } 5628 } 5629 5630 int LoopVectorizationCostModel::computePredInstDiscount( 5631 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, 5632 unsigned VF) { 5633 assert(!isUniformAfterVectorization(PredInst, VF) && 5634 "Instruction marked uniform-after-vectorization will be predicated"); 5635 5636 // Initialize the discount to zero, meaning that the scalar version and the 5637 // vector version cost the same. 5638 int Discount = 0; 5639 5640 // Holds instructions to analyze. The instructions we visit are mapped in 5641 // ScalarCosts. Those instructions are the ones that would be scalarized if 5642 // we find that the scalar version costs less. 5643 SmallVector<Instruction *, 8> Worklist; 5644 5645 // Returns true if the given instruction can be scalarized. 5646 auto canBeScalarized = [&](Instruction *I) -> bool { 5647 // We only attempt to scalarize instructions forming a single-use chain 5648 // from the original predicated block that would otherwise be vectorized. 5649 // Although not strictly necessary, we give up on instructions we know will 5650 // already be scalar to avoid traversing chains that are unlikely to be 5651 // beneficial. 5652 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5653 isScalarAfterVectorization(I, VF)) 5654 return false; 5655 5656 // If the instruction is scalar with predication, it will be analyzed 5657 // separately. We ignore it within the context of PredInst. 5658 if (isScalarWithPredication(I)) 5659 return false; 5660 5661 // If any of the instruction's operands are uniform after vectorization, 5662 // the instruction cannot be scalarized. This prevents, for example, a 5663 // masked load from being scalarized. 
5664 // 5665 // We assume we will only emit a value for lane zero of an instruction 5666 // marked uniform after vectorization, rather than VF identical values. 5667 // Thus, if we scalarize an instruction that uses a uniform, we would 5668 // create uses of values corresponding to the lanes we aren't emitting code 5669 // for. This behavior can be changed by allowing getScalarValue to clone 5670 // the lane zero values for uniforms rather than asserting. 5671 for (Use &U : I->operands()) 5672 if (auto *J = dyn_cast<Instruction>(U.get())) 5673 if (isUniformAfterVectorization(J, VF)) 5674 return false; 5675 5676 // Otherwise, we can scalarize the instruction. 5677 return true; 5678 }; 5679 5680 // Compute the expected cost discount from scalarizing the entire expression 5681 // feeding the predicated instruction. We currently only consider expressions 5682 // that are single-use instruction chains. 5683 Worklist.push_back(PredInst); 5684 while (!Worklist.empty()) { 5685 Instruction *I = Worklist.pop_back_val(); 5686 5687 // If we've already analyzed the instruction, there's nothing to do. 5688 if (ScalarCosts.find(I) != ScalarCosts.end()) 5689 continue; 5690 5691 // Compute the cost of the vector instruction. Note that this cost already 5692 // includes the scalarization overhead of the predicated instruction. 5693 unsigned VectorCost = getInstructionCost(I, VF).first; 5694 5695 // Compute the cost of the scalarized instruction. This cost is the cost of 5696 // the instruction as if it wasn't if-converted and instead remained in the 5697 // predicated block. We will scale this cost by block probability after 5698 // computing the scalarization overhead. 5699 unsigned ScalarCost = VF * getInstructionCost(I, 1).first; 5700 5701 // Compute the scalarization overhead of needed insertelement instructions 5702 // and phi nodes. 5703 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5704 ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), 5705 true, false); 5706 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); 5707 } 5708 5709 // Compute the scalarization overhead of needed extractelement 5710 // instructions. For each of the instruction's operands, if the operand can 5711 // be scalarized, add it to the worklist; otherwise, account for the 5712 // overhead. 5713 for (Use &U : I->operands()) 5714 if (auto *J = dyn_cast<Instruction>(U.get())) { 5715 assert(VectorType::isValidElementType(J->getType()) && 5716 "Instruction has non-scalar type"); 5717 if (canBeScalarized(J)) 5718 Worklist.push_back(J); 5719 else if (needsExtract(J, VF)) 5720 ScalarCost += TTI.getScalarizationOverhead( 5721 ToVectorTy(J->getType(),VF), false, true); 5722 } 5723 5724 // Scale the total scalar cost by block probability. 5725 ScalarCost /= getReciprocalPredBlockProb(); 5726 5727 // Compute the discount. A non-negative discount means the vector version 5728 // of the instruction costs more, and scalarizing would be beneficial. 5729 Discount += VectorCost - ScalarCost; 5730 ScalarCosts[I] = ScalarCost; 5731 } 5732 5733 return Discount; 5734 } 5735 5736 LoopVectorizationCostModel::VectorizationCostTy 5737 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5738 VectorizationCostTy Cost; 5739 5740 // For each block. 5741 for (BasicBlock *BB : TheLoop->blocks()) { 5742 VectorizationCostTy BlockCost; 5743 5744 // For each instruction in the old loop. 5745 for (Instruction &I : BB->instructionsWithoutDebug()) { 5746 // Skip ignored values. 
5747 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() || 5748 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end())) 5749 continue; 5750 5751 VectorizationCostTy C = getInstructionCost(&I, VF); 5752 5753 // Check if we should override the cost. 5754 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5755 C.first = ForceTargetInstructionCost; 5756 5757 BlockCost.first += C.first; 5758 BlockCost.second |= C.second; 5759 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5760 << " for VF " << VF << " For instruction: " << I 5761 << '\n'); 5762 } 5763 5764 // If we are vectorizing a predicated block, it will have been 5765 // if-converted. This means that the block's instructions (aside from 5766 // stores and instructions that may divide by zero) will now be 5767 // unconditionally executed. For the scalar case, we may not always execute 5768 // the predicated block. Thus, scale the block's cost by the probability of 5769 // executing it. 5770 if (VF == 1 && blockNeedsPredication(BB)) 5771 BlockCost.first /= getReciprocalPredBlockProb(); 5772 5773 Cost.first += BlockCost.first; 5774 Cost.second |= BlockCost.second; 5775 } 5776 5777 return Cost; 5778 } 5779 5780 /// Gets Address Access SCEV after verifying that the access pattern 5781 /// is loop invariant except the induction variable dependence. 5782 /// 5783 /// This SCEV can be sent to the Target in order to estimate the address 5784 /// calculation cost. 5785 static const SCEV *getAddressAccessSCEV( 5786 Value *Ptr, 5787 LoopVectorizationLegality *Legal, 5788 PredicatedScalarEvolution &PSE, 5789 const Loop *TheLoop) { 5790 5791 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5792 if (!Gep) 5793 return nullptr; 5794 5795 // We are looking for a gep with all loop invariant indices except for one 5796 // which should be an induction variable. 5797 auto SE = PSE.getSE(); 5798 unsigned NumOperands = Gep->getNumOperands(); 5799 for (unsigned i = 1; i < NumOperands; ++i) { 5800 Value *Opd = Gep->getOperand(i); 5801 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5802 !Legal->isInductionVariable(Opd)) 5803 return nullptr; 5804 } 5805 5806 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 5807 return PSE.getSCEV(Ptr); 5808 } 5809 5810 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 5811 return Legal->hasStride(I->getOperand(0)) || 5812 Legal->hasStride(I->getOperand(1)); 5813 } 5814 5815 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5816 unsigned VF) { 5817 assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); 5818 Type *ValTy = getMemInstValueType(I); 5819 auto SE = PSE.getSE(); 5820 5821 unsigned AS = getLoadStoreAddressSpace(I); 5822 Value *Ptr = getLoadStorePointerOperand(I); 5823 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5824 5825 // Figure out whether the access is strided and get the stride value 5826 // if it's known in compile time 5827 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5828 5829 // Get the cost of the scalar memory instruction and address computation. 5830 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5831 5832 // Don't pass *I here, since it is scalar but will actually be part of a 5833 // vectorized loop where the user of it is a vectorized instruction. 
5834 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5835 Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 5836 Alignment, AS); 5837 5838 // Get the overhead of the extractelement and insertelement instructions 5839 // we might create due to scalarization. 5840 Cost += getScalarizationOverhead(I, VF); 5841 5842 // If we have a predicated store, it may not be executed for each vector 5843 // lane. Scale the cost by the probability of executing the predicated 5844 // block. 5845 if (isPredicatedInst(I)) { 5846 Cost /= getReciprocalPredBlockProb(); 5847 5848 if (useEmulatedMaskMemRefHack(I)) 5849 // Artificially setting to a high enough value to practically disable 5850 // vectorization with such operations. 5851 Cost = 3000000; 5852 } 5853 5854 return Cost; 5855 } 5856 5857 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5858 unsigned VF) { 5859 Type *ValTy = getMemInstValueType(I); 5860 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5861 Value *Ptr = getLoadStorePointerOperand(I); 5862 unsigned AS = getLoadStoreAddressSpace(I); 5863 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 5864 5865 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5866 "Stride should be 1 or -1 for consecutive memory access"); 5867 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5868 unsigned Cost = 0; 5869 if (Legal->isMaskRequired(I)) 5870 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, 5871 Alignment ? Alignment->value() : 0, AS); 5872 else 5873 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); 5874 5875 bool Reverse = ConsecutiveStride < 0; 5876 if (Reverse) 5877 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5878 return Cost; 5879 } 5880 5881 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5882 unsigned VF) { 5883 Type *ValTy = getMemInstValueType(I); 5884 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5885 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5886 unsigned AS = getLoadStoreAddressSpace(I); 5887 if (isa<LoadInst>(I)) { 5888 return TTI.getAddressComputationCost(ValTy) + 5889 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + 5890 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 5891 } 5892 StoreInst *SI = cast<StoreInst>(I); 5893 5894 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 5895 return TTI.getAddressComputationCost(ValTy) + 5896 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + 5897 (isLoopInvariantStoreValue 5898 ? 0 5899 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 5900 VF - 1)); 5901 } 5902 5903 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 5904 unsigned VF) { 5905 Type *ValTy = getMemInstValueType(I); 5906 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5907 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5908 Value *Ptr = getLoadStorePointerOperand(I); 5909 5910 return TTI.getAddressComputationCost(VectorTy) + 5911 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, 5912 Legal->isMaskRequired(I), 5913 Alignment ? 
Alignment->value() : 0, I); 5914 } 5915 5916 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 5917 unsigned VF) { 5918 Type *ValTy = getMemInstValueType(I); 5919 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5920 unsigned AS = getLoadStoreAddressSpace(I); 5921 5922 auto Group = getInterleavedAccessGroup(I); 5923 assert(Group && "Fail to get an interleaved access group."); 5924 5925 unsigned InterleaveFactor = Group->getFactor(); 5926 VectorType *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 5927 5928 // Holds the indices of existing members in an interleaved load group. 5929 // An interleaved store group doesn't need this as it doesn't allow gaps. 5930 SmallVector<unsigned, 4> Indices; 5931 if (isa<LoadInst>(I)) { 5932 for (unsigned i = 0; i < InterleaveFactor; i++) 5933 if (Group->getMember(i)) 5934 Indices.push_back(i); 5935 } 5936 5937 // Calculate the cost of the whole interleaved group. 5938 bool UseMaskForGaps = 5939 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5940 unsigned Cost = TTI.getInterleavedMemoryOpCost( 5941 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5942 Group->getAlign().value(), AS, Legal->isMaskRequired(I), UseMaskForGaps); 5943 5944 if (Group->isReverse()) { 5945 // TODO: Add support for reversed masked interleaved access. 5946 assert(!Legal->isMaskRequired(I) && 5947 "Reverse masked interleaved access not supported."); 5948 Cost += Group->getNumMembers() * 5949 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5950 } 5951 return Cost; 5952 } 5953 5954 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 5955 unsigned VF) { 5956 // Calculate scalar cost only. Vectorization cost should be ready at this 5957 // moment. 5958 if (VF == 1) { 5959 Type *ValTy = getMemInstValueType(I); 5960 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5961 unsigned AS = getLoadStoreAddressSpace(I); 5962 5963 return TTI.getAddressComputationCost(ValTy) + 5964 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); 5965 } 5966 return getWideningCost(I, VF); 5967 } 5968 5969 LoopVectorizationCostModel::VectorizationCostTy 5970 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 5971 // If we know that this instruction will remain uniform, check the cost of 5972 // the scalar version. 5973 if (isUniformAfterVectorization(I, VF)) 5974 VF = 1; 5975 5976 if (VF > 1 && isProfitableToScalarize(I, VF)) 5977 return VectorizationCostTy(InstsToScalarize[VF][I], false); 5978 5979 // Forced scalars do not have any scalarization overhead. 
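  // Illustrative sketch (costs assumed): an address computation that was
  // forced to remain scalar is charged below as VF copies of its scalar cost,
  // e.g. scalar cost 1 at VF = 4 gives 4, with no insert/extract overhead
  // added on top.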
5980 auto ForcedScalar = ForcedScalars.find(VF); 5981 if (VF > 1 && ForcedScalar != ForcedScalars.end()) { 5982 auto InstSet = ForcedScalar->second; 5983 if (InstSet.find(I) != InstSet.end()) 5984 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); 5985 } 5986 5987 Type *VectorTy; 5988 unsigned C = getInstructionCost(I, VF, VectorTy); 5989 5990 bool TypeNotScalarized = 5991 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; 5992 return VectorizationCostTy(C, TypeNotScalarized); 5993 } 5994 5995 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 5996 unsigned VF) { 5997 5998 if (VF == 1) 5999 return 0; 6000 6001 unsigned Cost = 0; 6002 Type *RetTy = ToVectorTy(I->getType(), VF); 6003 if (!RetTy->isVoidTy() && 6004 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6005 Cost += TTI.getScalarizationOverhead(RetTy, true, false); 6006 6007 // Some targets keep addresses scalar. 6008 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6009 return Cost; 6010 6011 // Some targets support efficient element stores. 6012 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6013 return Cost; 6014 6015 // Collect operands to consider. 6016 CallInst *CI = dyn_cast<CallInst>(I); 6017 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6018 6019 // Skip operands that do not require extraction/scalarization and do not incur 6020 // any overhead. 6021 return Cost + TTI.getOperandsScalarizationOverhead( 6022 filterExtractingOperands(Ops, VF), VF); 6023 } 6024 6025 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { 6026 if (VF == 1) 6027 return; 6028 NumPredStores = 0; 6029 for (BasicBlock *BB : TheLoop->blocks()) { 6030 // For each instruction in the old loop. 6031 for (Instruction &I : *BB) { 6032 Value *Ptr = getLoadStorePointerOperand(&I); 6033 if (!Ptr) 6034 continue; 6035 6036 // TODO: We should generate better code and update the cost model for 6037 // predicated uniform stores. Today they are treated as any other 6038 // predicated store (see added test cases in 6039 // invariant-store-vectorization.ll). 6040 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6041 NumPredStores++; 6042 6043 if (Legal->isUniform(Ptr) && 6044 // Conditional loads and stores should be scalarized and predicated. 6045 // isScalarWithPredication cannot be used here since masked 6046 // gather/scatters are not considered scalar with predication. 6047 !Legal->blockNeedsPredication(I.getParent())) { 6048 // TODO: Avoid replicating loads and stores instead of 6049 // relying on instcombine to remove them. 6050 // Load: Scalar load + broadcast 6051 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6052 unsigned Cost = getUniformMemOpCost(&I, VF); 6053 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6054 continue; 6055 } 6056 6057 // We assume that widening is the best solution when possible. 6058 if (memoryInstructionCanBeWidened(&I, VF)) { 6059 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6060 int ConsecutiveStride = 6061 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6062 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6063 "Expected consecutive stride."); 6064 InstWidening Decision = 6065 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6066 setWideningDecision(&I, VF, Decision, Cost); 6067 continue; 6068 } 6069 6070 // Choose between Interleaving, Gather/Scatter or Scalarization. 
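      // Illustrative sketch (costs assumed, not taken from any target): if for
      // the current VF the interleave-group cost is 8, the gather/scatter cost
      // is 12 and the scalarization cost is 20, the comparison below selects
      // CM_Interleave with cost 8; interleaving wins ties against
      // gather/scatter but must be strictly cheaper than scalarization.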
6071 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6072 unsigned NumAccesses = 1; 6073 if (isAccessInterleaved(&I)) { 6074 auto Group = getInterleavedAccessGroup(&I); 6075 assert(Group && "Fail to get an interleaved access group."); 6076 6077 // Make one decision for the whole group. 6078 if (getWideningDecision(&I, VF) != CM_Unknown) 6079 continue; 6080 6081 NumAccesses = Group->getNumMembers(); 6082 if (interleavedAccessCanBeWidened(&I, VF)) 6083 InterleaveCost = getInterleaveGroupCost(&I, VF); 6084 } 6085 6086 unsigned GatherScatterCost = 6087 isLegalGatherOrScatter(&I) 6088 ? getGatherScatterCost(&I, VF) * NumAccesses 6089 : std::numeric_limits<unsigned>::max(); 6090 6091 unsigned ScalarizationCost = 6092 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6093 6094 // Choose better solution for the current VF, 6095 // write down this decision and use it during vectorization. 6096 unsigned Cost; 6097 InstWidening Decision; 6098 if (InterleaveCost <= GatherScatterCost && 6099 InterleaveCost < ScalarizationCost) { 6100 Decision = CM_Interleave; 6101 Cost = InterleaveCost; 6102 } else if (GatherScatterCost < ScalarizationCost) { 6103 Decision = CM_GatherScatter; 6104 Cost = GatherScatterCost; 6105 } else { 6106 Decision = CM_Scalarize; 6107 Cost = ScalarizationCost; 6108 } 6109 // If the instructions belongs to an interleave group, the whole group 6110 // receives the same decision. The whole group receives the cost, but 6111 // the cost will actually be assigned to one instruction. 6112 if (auto Group = getInterleavedAccessGroup(&I)) 6113 setWideningDecision(Group, VF, Decision, Cost); 6114 else 6115 setWideningDecision(&I, VF, Decision, Cost); 6116 } 6117 } 6118 6119 // Make sure that any load of address and any other address computation 6120 // remains scalar unless there is gather/scatter support. This avoids 6121 // inevitable extracts into address registers, and also has the benefit of 6122 // activating LSR more, since that pass can't optimize vectorized 6123 // addresses. 6124 if (TTI.prefersVectorizedAddressing()) 6125 return; 6126 6127 // Start with all scalar pointer uses. 6128 SmallPtrSet<Instruction *, 8> AddrDefs; 6129 for (BasicBlock *BB : TheLoop->blocks()) 6130 for (Instruction &I : *BB) { 6131 Instruction *PtrDef = 6132 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6133 if (PtrDef && TheLoop->contains(PtrDef) && 6134 getWideningDecision(&I, VF) != CM_GatherScatter) 6135 AddrDefs.insert(PtrDef); 6136 } 6137 6138 // Add all instructions used to generate the addresses. 6139 SmallVector<Instruction *, 4> Worklist; 6140 for (auto *I : AddrDefs) 6141 Worklist.push_back(I); 6142 while (!Worklist.empty()) { 6143 Instruction *I = Worklist.pop_back_val(); 6144 for (auto &Op : I->operands()) 6145 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6146 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6147 AddrDefs.insert(InstOp).second) 6148 Worklist.push_back(InstOp); 6149 } 6150 6151 for (auto *I : AddrDefs) { 6152 if (isa<LoadInst>(I)) { 6153 // Setting the desired widening decision should ideally be handled in 6154 // by cost functions, but since this involves the task of finding out 6155 // if the loaded register is involved in an address computation, it is 6156 // instead changed here when we know this is the case. 6157 InstWidening Decision = getWideningDecision(I, VF); 6158 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6159 // Scalarize a widened load of address. 
6160 setWideningDecision(I, VF, CM_Scalarize, 6161 (VF * getMemoryInstructionCost(I, 1))); 6162 else if (auto Group = getInterleavedAccessGroup(I)) { 6163 // Scalarize an interleave group of address loads. 6164 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6165 if (Instruction *Member = Group->getMember(I)) 6166 setWideningDecision(Member, VF, CM_Scalarize, 6167 (VF * getMemoryInstructionCost(Member, 1))); 6168 } 6169 } 6170 } else 6171 // Make sure I gets scalarized and a cost estimate without 6172 // scalarization overhead. 6173 ForcedScalars[VF].insert(I); 6174 } 6175 } 6176 6177 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6178 unsigned VF, 6179 Type *&VectorTy) { 6180 Type *RetTy = I->getType(); 6181 if (canTruncateToMinimalBitwidth(I, VF)) 6182 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6183 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6184 auto SE = PSE.getSE(); 6185 6186 // TODO: We need to estimate the cost of intrinsic calls. 6187 switch (I->getOpcode()) { 6188 case Instruction::GetElementPtr: 6189 // We mark this instruction as zero-cost because the cost of GEPs in 6190 // vectorized code depends on whether the corresponding memory instruction 6191 // is scalarized or not. Therefore, we handle GEPs with the memory 6192 // instruction cost. 6193 return 0; 6194 case Instruction::Br: { 6195 // In cases of scalarized and predicated instructions, there will be VF 6196 // predicated blocks in the vectorized loop. Each branch around these 6197 // blocks requires also an extract of its vector compare i1 element. 6198 bool ScalarPredicatedBB = false; 6199 BranchInst *BI = cast<BranchInst>(I); 6200 if (VF > 1 && BI->isConditional() && 6201 (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) != 6202 PredicatedBBsAfterVectorization.end() || 6203 PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) != 6204 PredicatedBBsAfterVectorization.end())) 6205 ScalarPredicatedBB = true; 6206 6207 if (ScalarPredicatedBB) { 6208 // Return cost for branches around scalarized and predicated blocks. 6209 Type *Vec_i1Ty = 6210 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6211 return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) + 6212 (TTI.getCFInstrCost(Instruction::Br) * VF)); 6213 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) 6214 // The back-edge branch will remain, as will all scalar branches. 6215 return TTI.getCFInstrCost(Instruction::Br); 6216 else 6217 // This branch will be eliminated by if-conversion. 6218 return 0; 6219 // Note: We currently assume zero cost for an unconditional branch inside 6220 // a predicated block since it will become a fall-through, although we 6221 // may decide in the future to call TTI for all branches. 6222 } 6223 case Instruction::PHI: { 6224 auto *Phi = cast<PHINode>(I); 6225 6226 // First-order recurrences are replaced by vector shuffles inside the loop. 6227 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6228 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) 6229 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 6230 cast<VectorType>(VectorTy), VF - 1, 6231 VectorType::get(RetTy, 1)); 6232 6233 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6234 // converted into select instructions. We require N - 1 selects per phi 6235 // node, where N is the number of incoming values. 
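    // Illustrative sketch (hypothetical IR): a non-header phi such as
    //   %r = phi i32 [ %a, %bb1 ], [ %b, %bb2 ], [ %c, %bb3 ]
    // has N = 3 incoming values and is charged below as N - 1 = 2 selects on
    // <VF x i32> values, each guarded by a <VF x i1> condition.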
6236 if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) 6237 return (Phi->getNumIncomingValues() - 1) * 6238 TTI.getCmpSelInstrCost( 6239 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6240 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)); 6241 6242 return TTI.getCFInstrCost(Instruction::PHI); 6243 } 6244 case Instruction::UDiv: 6245 case Instruction::SDiv: 6246 case Instruction::URem: 6247 case Instruction::SRem: 6248 // If we have a predicated instruction, it may not be executed for each 6249 // vector lane. Get the scalarization cost and scale this amount by the 6250 // probability of executing the predicated block. If the instruction is not 6251 // predicated, we fall through to the next case. 6252 if (VF > 1 && isScalarWithPredication(I)) { 6253 unsigned Cost = 0; 6254 6255 // These instructions have a non-void type, so account for the phi nodes 6256 // that we will create. This cost is likely to be zero. The phi node 6257 // cost, if any, should be scaled by the block probability because it 6258 // models a copy at the end of each predicated block. 6259 Cost += VF * TTI.getCFInstrCost(Instruction::PHI); 6260 6261 // The cost of the non-predicated instruction. 6262 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy); 6263 6264 // The cost of insertelement and extractelement instructions needed for 6265 // scalarization. 6266 Cost += getScalarizationOverhead(I, VF); 6267 6268 // Scale the cost by the probability of executing the predicated blocks. 6269 // This assumes the predicated block for each vector lane is equally 6270 // likely. 6271 return Cost / getReciprocalPredBlockProb(); 6272 } 6273 LLVM_FALLTHROUGH; 6274 case Instruction::Add: 6275 case Instruction::FAdd: 6276 case Instruction::Sub: 6277 case Instruction::FSub: 6278 case Instruction::Mul: 6279 case Instruction::FMul: 6280 case Instruction::FDiv: 6281 case Instruction::FRem: 6282 case Instruction::Shl: 6283 case Instruction::LShr: 6284 case Instruction::AShr: 6285 case Instruction::And: 6286 case Instruction::Or: 6287 case Instruction::Xor: { 6288 // Since we will replace the stride by 1 the multiplication should go away. 6289 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6290 return 0; 6291 // Certain instructions can be cheaper to vectorize if they have a constant 6292 // second vector operand. One example of this are shifts on x86. 6293 Value *Op2 = I->getOperand(1); 6294 TargetTransformInfo::OperandValueProperties Op2VP; 6295 TargetTransformInfo::OperandValueKind Op2VK = 6296 TTI.getOperandInfo(Op2, Op2VP); 6297 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6298 Op2VK = TargetTransformInfo::OK_UniformValue; 6299 6300 SmallVector<const Value *, 4> Operands(I->operand_values()); 6301 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6302 return N * TTI.getArithmeticInstrCost( 6303 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6304 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6305 } 6306 case Instruction::FNeg: { 6307 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6308 return N * TTI.getArithmeticInstrCost( 6309 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6310 TargetTransformInfo::OK_AnyValue, 6311 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6312 I->getOperand(0), I); 6313 } 6314 case Instruction::Select: { 6315 SelectInst *SI = cast<SelectInst>(I); 6316 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6317 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6318 Type *CondTy = SI->getCondition()->getType(); 6319 if (!ScalarCond) 6320 CondTy = VectorType::get(CondTy, VF); 6321 6322 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I); 6323 } 6324 case Instruction::ICmp: 6325 case Instruction::FCmp: { 6326 Type *ValTy = I->getOperand(0)->getType(); 6327 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6328 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6329 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6330 VectorTy = ToVectorTy(ValTy, VF); 6331 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I); 6332 } 6333 case Instruction::Store: 6334 case Instruction::Load: { 6335 unsigned Width = VF; 6336 if (Width > 1) { 6337 InstWidening Decision = getWideningDecision(I, Width); 6338 assert(Decision != CM_Unknown && 6339 "CM decision should be taken at this point"); 6340 if (Decision == CM_Scalarize) 6341 Width = 1; 6342 } 6343 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6344 return getMemoryInstructionCost(I, VF); 6345 } 6346 case Instruction::ZExt: 6347 case Instruction::SExt: 6348 case Instruction::FPToUI: 6349 case Instruction::FPToSI: 6350 case Instruction::FPExt: 6351 case Instruction::PtrToInt: 6352 case Instruction::IntToPtr: 6353 case Instruction::SIToFP: 6354 case Instruction::UIToFP: 6355 case Instruction::Trunc: 6356 case Instruction::FPTrunc: 6357 case Instruction::BitCast: { 6358 // We optimize the truncation of induction variables having constant 6359 // integer steps. The cost of these truncations is the same as the scalar 6360 // operation. 6361 if (isOptimizableIVTruncate(I, VF)) { 6362 auto *Trunc = cast<TruncInst>(I); 6363 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6364 Trunc->getSrcTy(), Trunc); 6365 } 6366 6367 Type *SrcScalarTy = I->getOperand(0)->getType(); 6368 Type *SrcVecTy = 6369 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6370 if (canTruncateToMinimalBitwidth(I, VF)) { 6371 // This cast is going to be shrunk. This may remove the cast or it might 6372 // turn it into slightly different cast. For example, if MinBW == 16, 6373 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6374 // 6375 // Calculate the modified src and dest types. 6376 Type *MinVecTy = VectorTy; 6377 if (I->getOpcode() == Instruction::Trunc) { 6378 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6379 VectorTy = 6380 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6381 } else if (I->getOpcode() == Instruction::ZExt || 6382 I->getOpcode() == Instruction::SExt) { 6383 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6384 VectorTy = 6385 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6386 } 6387 } 6388 6389 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6390 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); 6391 } 6392 case Instruction::Call: { 6393 bool NeedToScalarize; 6394 CallInst *CI = cast<CallInst>(I); 6395 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6396 if (getVectorIntrinsicIDForCall(CI, TLI)) 6397 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6398 return CallCost; 6399 } 6400 default: 6401 // The cost of executing VF copies of the scalar instruction. This opcode 6402 // is unknown. Assume that it is the same as 'mul'. 6403 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + 6404 getScalarizationOverhead(I, VF); 6405 } // end of switch. 6406 } 6407 6408 char LoopVectorize::ID = 0; 6409 6410 static const char lv_name[] = "Loop Vectorization"; 6411 6412 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6413 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6414 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6415 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6416 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6417 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6418 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6419 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6420 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6421 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6422 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6423 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6424 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6425 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6426 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6427 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6428 6429 namespace llvm { 6430 6431 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6432 6433 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6434 bool VectorizeOnlyWhenForced) { 6435 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6436 } 6437 6438 } // end namespace llvm 6439 6440 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6441 // Check if the pointer operand of a load or store instruction is 6442 // consecutive. 6443 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6444 return Legal->isConsecutivePtr(Ptr); 6445 return false; 6446 } 6447 6448 void LoopVectorizationCostModel::collectValuesToIgnore() { 6449 // Ignore ephemeral values. 6450 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6451 6452 // Ignore type-promoting instructions we identified during reduction 6453 // detection. 6454 for (auto &Reduction : Legal->getReductionVars()) { 6455 RecurrenceDescriptor &RedDes = Reduction.second; 6456 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6457 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6458 } 6459 // Ignore type-casting instructions we identified during induction 6460 // detection. 6461 for (auto &Induction : Legal->getInductionVars()) { 6462 InductionDescriptor &IndDes = Induction.second; 6463 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6464 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6465 } 6466 } 6467 6468 // TODO: we could return a pair of values that specify the max VF and 6469 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6470 // `buildVPlans(VF, VF)`. 
We cannot do it because VPLAN at the moment 6471 // doesn't have a cost model that can choose which plan to execute if 6472 // more than one is generated. 6473 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 6474 LoopVectorizationCostModel &CM) { 6475 unsigned WidestType; 6476 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 6477 return WidestVectorRegBits / WidestType; 6478 } 6479 6480 VectorizationFactor 6481 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { 6482 unsigned VF = UserVF; 6483 // Outer loop handling: They may require CFG and instruction level 6484 // transformations before even evaluating whether vectorization is profitable. 6485 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 6486 // the vectorization pipeline. 6487 if (!OrigLoop->empty()) { 6488 // If the user doesn't provide a vectorization factor, determine a 6489 // reasonable one. 6490 if (!UserVF) { 6491 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM); 6492 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 6493 6494 // Make sure we have a VF > 1 for stress testing. 6495 if (VPlanBuildStressTest && VF < 2) { 6496 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 6497 << "overriding computed VF.\n"); 6498 VF = 4; 6499 } 6500 } 6501 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 6502 assert(isPowerOf2_32(VF) && "VF needs to be a power of two"); 6503 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF 6504 << " to build VPlans.\n"); 6505 buildVPlans(VF, VF); 6506 6507 // For VPlan build stress testing, we bail out after VPlan construction. 6508 if (VPlanBuildStressTest) 6509 return VectorizationFactor::Disabled(); 6510 6511 return {VF, 0}; 6512 } 6513 6514 LLVM_DEBUG( 6515 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 6516 "VPlan-native path.\n"); 6517 return VectorizationFactor::Disabled(); 6518 } 6519 6520 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) { 6521 assert(OrigLoop->empty() && "Inner loop expected."); 6522 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(); 6523 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 6524 return None; 6525 6526 // Invalidate interleave groups if all blocks of loop will be predicated. 6527 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 6528 !useMaskedInterleavedAccesses(*TTI)) { 6529 LLVM_DEBUG( 6530 dbgs() 6531 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 6532 "which requires masked-interleaved support.\n"); 6533 if (CM.InterleaveInfo.invalidateGroups()) 6534 // Invalidating interleave groups also requires invalidating all decisions 6535 // based on them, which includes widening decisions and uniform and scalar 6536 // values. 6537 CM.invalidateCostModelingDecisions(); 6538 } 6539 6540 if (UserVF) { 6541 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 6542 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); 6543 // Collect the instructions (and their associated costs) that will be more 6544 // profitable to scalarize. 
6545 CM.selectUserVectorizationFactor(UserVF); 6546 buildVPlansWithVPRecipes(UserVF, UserVF); 6547 LLVM_DEBUG(printPlans(dbgs())); 6548 return {{UserVF, 0}}; 6549 } 6550 6551 unsigned MaxVF = MaybeMaxVF.getValue(); 6552 assert(MaxVF != 0 && "MaxVF is zero."); 6553 6554 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 6555 // Collect Uniform and Scalar instructions after vectorization with VF. 6556 CM.collectUniformsAndScalars(VF); 6557 6558 // Collect the instructions (and their associated costs) that will be more 6559 // profitable to scalarize. 6560 if (VF > 1) 6561 CM.collectInstsToScalarize(VF); 6562 } 6563 6564 buildVPlansWithVPRecipes(1, MaxVF); 6565 LLVM_DEBUG(printPlans(dbgs())); 6566 if (MaxVF == 1) 6567 return VectorizationFactor::Disabled(); 6568 6569 // Select the optimal vectorization factor. 6570 return CM.selectVectorizationFactor(MaxVF); 6571 } 6572 6573 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { 6574 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 6575 << '\n'); 6576 BestVF = VF; 6577 BestUF = UF; 6578 6579 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 6580 return !Plan->hasVF(VF); 6581 }); 6582 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 6583 } 6584 6585 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 6586 DominatorTree *DT) { 6587 // Perform the actual loop transformation. 6588 6589 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 6590 VPCallbackILV CallbackILV(ILV); 6591 6592 VPTransformState State{BestVF, BestUF, LI, 6593 DT, ILV.Builder, ILV.VectorLoopValueMap, 6594 &ILV, CallbackILV}; 6595 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 6596 State.TripCount = ILV.getOrCreateTripCount(nullptr); 6597 State.CanonicalIV = ILV.Induction; 6598 6599 //===------------------------------------------------===// 6600 // 6601 // Notice: any optimization or new instruction that go 6602 // into the code below should also be implemented in 6603 // the cost-model. 6604 // 6605 //===------------------------------------------------===// 6606 6607 // 2. Copy and widen instructions from the old loop into the new loop. 6608 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 6609 VPlans.front()->execute(&State); 6610 6611 // 3. Fix the vectorized code: take care of header phi's, live-outs, 6612 // predication, updating analyses. 6613 ILV.fixVectorizedLoop(); 6614 } 6615 6616 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 6617 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6618 BasicBlock *Latch = OrigLoop->getLoopLatch(); 6619 6620 // We create new control-flow for the vectorized loop, so the original 6621 // condition will be dead after vectorization if it's only used by the 6622 // branch. 6623 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 6624 if (Cmp && Cmp->hasOneUse()) 6625 DeadInstructions.insert(Cmp); 6626 6627 // We create new "steps" for induction variable updates to which the original 6628 // induction variables map. An original update instruction will be dead if 6629 // all its users except the induction variable are dead. 
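  // Illustrative sketch (hypothetical loop): for an induction update such as
  //   %i.next = add nuw i64 %i, 1
  // whose only users are the header phi %i and the single-use latch compare
  // already collected above, the loop below records %i.next as dead as well.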
6630 for (auto &Induction : Legal->getInductionVars()) { 6631 PHINode *Ind = Induction.first; 6632 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 6633 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 6634 return U == Ind || DeadInstructions.find(cast<Instruction>(U)) != 6635 DeadInstructions.end(); 6636 })) 6637 DeadInstructions.insert(IndUpdate); 6638 6639 // We record as "Dead" also the type-casting instructions we had identified 6640 // during induction analysis. We don't need any handling for them in the 6641 // vectorized loop because we have proven that, under a proper runtime 6642 // test guarding the vectorized loop, the value of the phi, and the casted 6643 // value of the phi, are the same. The last instruction in this casting chain 6644 // will get its scalar/vector/widened def from the scalar/vector/widened def 6645 // of the respective phi node. Any other casts in the induction def-use chain 6646 // have no other uses outside the phi update chain, and will be ignored. 6647 InductionDescriptor &IndDes = Induction.second; 6648 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6649 DeadInstructions.insert(Casts.begin(), Casts.end()); 6650 } 6651 } 6652 6653 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6654 6655 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6656 6657 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6658 Instruction::BinaryOps BinOp) { 6659 // When unrolling and the VF is 1, we only need to add a simple scalar. 6660 Type *Ty = Val->getType(); 6661 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6662 6663 if (Ty->isFloatingPointTy()) { 6664 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6665 6666 // Floating point operations had to be 'fast' to enable the unrolling. 6667 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6668 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6669 } 6670 Constant *C = ConstantInt::get(Ty, StartIdx); 6671 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6672 } 6673 6674 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6675 SmallVector<Metadata *, 4> MDs; 6676 // Reserve first location for self reference to the LoopID metadata node. 6677 MDs.push_back(nullptr); 6678 bool IsUnrollMetadata = false; 6679 MDNode *LoopID = L->getLoopID(); 6680 if (LoopID) { 6681 // First find existing loop unrolling disable metadata. 6682 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6683 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6684 if (MD) { 6685 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6686 IsUnrollMetadata = 6687 S && S->getString().startswith("llvm.loop.unroll.disable"); 6688 } 6689 MDs.push_back(LoopID->getOperand(i)); 6690 } 6691 } 6692 6693 if (!IsUnrollMetadata) { 6694 // Add runtime unroll disable metadata. 6695 LLVMContext &Context = L->getHeader()->getContext(); 6696 SmallVector<Metadata *, 1> DisableOperands; 6697 DisableOperands.push_back( 6698 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6699 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6700 MDs.push_back(DisableNode); 6701 MDNode *NewLoopID = MDNode::get(Context, MDs); 6702 // Set operand 0 to refer to the loop id itself. 
6703 NewLoopID->replaceOperandWith(0, NewLoopID); 6704 L->setLoopID(NewLoopID); 6705 } 6706 } 6707 6708 bool LoopVectorizationPlanner::getDecisionAndClampRange( 6709 const std::function<bool(unsigned)> &Predicate, VFRange &Range) { 6710 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 6711 bool PredicateAtRangeStart = Predicate(Range.Start); 6712 6713 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 6714 if (Predicate(TmpVF) != PredicateAtRangeStart) { 6715 Range.End = TmpVF; 6716 break; 6717 } 6718 6719 return PredicateAtRangeStart; 6720 } 6721 6722 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 6723 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 6724 /// of VF's starting at a given VF and extending it as much as possible. Each 6725 /// vectorization decision can potentially shorten this sub-range during 6726 /// buildVPlan(). 6727 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 6728 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6729 VFRange SubRange = {VF, MaxVF + 1}; 6730 VPlans.push_back(buildVPlan(SubRange)); 6731 VF = SubRange.End; 6732 } 6733 } 6734 6735 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 6736 VPlanPtr &Plan) { 6737 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 6738 6739 // Look for cached value. 6740 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 6741 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 6742 if (ECEntryIt != EdgeMaskCache.end()) 6743 return ECEntryIt->second; 6744 6745 VPValue *SrcMask = createBlockInMask(Src, Plan); 6746 6747 // The terminator has to be a branch inst! 6748 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 6749 assert(BI && "Unexpected terminator found"); 6750 6751 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 6752 return EdgeMaskCache[Edge] = SrcMask; 6753 6754 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 6755 assert(EdgeMask && "No Edge Mask found for condition"); 6756 6757 if (BI->getSuccessor(0) != Dst) 6758 EdgeMask = Builder.createNot(EdgeMask); 6759 6760 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 6761 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 6762 6763 return EdgeMaskCache[Edge] = EdgeMask; 6764 } 6765 6766 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 6767 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 6768 6769 // Look for cached value. 6770 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 6771 if (BCEntryIt != BlockMaskCache.end()) 6772 return BCEntryIt->second; 6773 6774 // All-one mask is modelled as no-mask following the convention for masked 6775 // load/store/gather/scatter. Initialize BlockMask to no-mask. 6776 VPValue *BlockMask = nullptr; 6777 6778 if (OrigLoop->getHeader() == BB) { 6779 if (!CM.blockNeedsPredication(BB)) 6780 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 6781 6782 // Introduce the early-exit compare IV <= BTC to form header block mask. 6783 // This is used instead of IV < TC because TC may wrap, unlike BTC. 6784 // Start by constructing the desired canonical IV. 
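    // Illustration of the compare described above (trip count assumed): with
    // TC = 10 and VF = 4, BTC = 9; on the last vector iteration the IV lanes
    // <8, 9, 10, 11> are compared ULE against 9, giving the mask <1, 1, 0, 0>
    // that disables the two out-of-range lanes.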
6785 VPValue *IV = nullptr; 6786 if (Legal->getPrimaryInduction()) 6787 IV = Plan->getVPValue(Legal->getPrimaryInduction()); 6788 else { 6789 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 6790 Builder.getInsertBlock()->appendRecipe(IVRecipe); 6791 IV = IVRecipe->getVPValue(); 6792 } 6793 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 6794 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 6795 return BlockMaskCache[BB] = BlockMask; 6796 } 6797 6798 // This is the block mask. We OR all incoming edges. 6799 for (auto *Predecessor : predecessors(BB)) { 6800 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 6801 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 6802 return BlockMaskCache[BB] = EdgeMask; 6803 6804 if (!BlockMask) { // BlockMask has its initialized nullptr value. 6805 BlockMask = EdgeMask; 6806 continue; 6807 } 6808 6809 BlockMask = Builder.createOr(BlockMask, EdgeMask); 6810 } 6811 6812 return BlockMaskCache[BB] = BlockMask; 6813 } 6814 6815 VPWidenMemoryInstructionRecipe * 6816 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 6817 VPlanPtr &Plan) { 6818 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 6819 "Must be called with either a load or store"); 6820 6821 auto willWiden = [&](unsigned VF) -> bool { 6822 if (VF == 1) 6823 return false; 6824 LoopVectorizationCostModel::InstWidening Decision = 6825 CM.getWideningDecision(I, VF); 6826 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 6827 "CM decision should be taken at this point."); 6828 if (Decision == LoopVectorizationCostModel::CM_Interleave) 6829 return true; 6830 if (CM.isScalarAfterVectorization(I, VF) || 6831 CM.isProfitableToScalarize(I, VF)) 6832 return false; 6833 return Decision != LoopVectorizationCostModel::CM_Scalarize; 6834 }; 6835 6836 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6837 return nullptr; 6838 6839 VPValue *Mask = nullptr; 6840 if (Legal->isMaskRequired(I)) 6841 Mask = createBlockInMask(I->getParent(), Plan); 6842 6843 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 6844 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 6845 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 6846 6847 StoreInst *Store = cast<StoreInst>(I); 6848 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 6849 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 6850 } 6851 6852 VPWidenIntOrFpInductionRecipe * 6853 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { 6854 // Check if this is an integer or fp induction. If so, build the recipe that 6855 // produces its scalar and vector values. 6856 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 6857 if (II.getKind() == InductionDescriptor::IK_IntInduction || 6858 II.getKind() == InductionDescriptor::IK_FpInduction) 6859 return new VPWidenIntOrFpInductionRecipe(Phi); 6860 6861 return nullptr; 6862 } 6863 6864 VPWidenIntOrFpInductionRecipe * 6865 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, 6866 VFRange &Range) const { 6867 // Optimize the special case where the source is a constant integer 6868 // induction variable. Notice that we can only optimize the 'trunc' case 6869 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 6870 // (c) other casts depend on pointer size. 6871 6872 // Determine whether \p K is a truncation based on an induction variable that 6873 // can be optimized. 
6874 auto isOptimizableIVTruncate = 6875 [&](Instruction *K) -> std::function<bool(unsigned)> { 6876 return 6877 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; 6878 }; 6879 6880 if (LoopVectorizationPlanner::getDecisionAndClampRange( 6881 isOptimizableIVTruncate(I), Range)) 6882 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 6883 I); 6884 return nullptr; 6885 } 6886 6887 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 6888 // We know that all PHIs in non-header blocks are converted into selects, so 6889 // we don't have to worry about the insertion order and we can just use the 6890 // builder. At this point we generate the predication tree. There may be 6891 // duplications since this is a simple recursive scan, but future 6892 // optimizations will clean it up. 6893 6894 SmallVector<VPValue *, 2> Operands; 6895 unsigned NumIncoming = Phi->getNumIncomingValues(); 6896 for (unsigned In = 0; In < NumIncoming; In++) { 6897 VPValue *EdgeMask = 6898 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 6899 assert((EdgeMask || NumIncoming == 1) && 6900 "Multiple predecessors with one having a full mask"); 6901 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 6902 if (EdgeMask) 6903 Operands.push_back(EdgeMask); 6904 } 6905 return new VPBlendRecipe(Phi, Operands); 6906 } 6907 6908 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 6909 VPlan &Plan) const { 6910 6911 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6912 [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); }, 6913 Range); 6914 6915 if (IsPredicated) 6916 return nullptr; 6917 6918 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6919 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 6920 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 6921 return nullptr; 6922 6923 auto willWiden = [&](unsigned VF) -> bool { 6924 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6925 // The following case may be scalarized depending on the VF. 6926 // The flag shows whether we use Intrinsic or a usual Call for vectorized 6927 // version of the instruction. 6928 // Is it beneficial to perform intrinsic call compared to lib call? 6929 bool NeedToScalarize = false; 6930 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 6931 bool UseVectorIntrinsic = 6932 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 6933 return UseVectorIntrinsic || !NeedToScalarize; 6934 }; 6935 6936 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6937 return nullptr; 6938 6939 // Success: widen this call. 6940 auto VPValues = map_range(CI->arg_operands(), [&Plan](Value *Op) { 6941 return Plan.getOrAddVPValue(Op); 6942 }); 6943 6944 return new VPWidenCallRecipe(*CI, VPValues); 6945 } 6946 6947 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 6948 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 6949 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 6950 // Instruction should be widened, unless it is scalar after vectorization, 6951 // scalarization is profitable or it is predicated. 
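  // Illustrative sketch (behaviour assumed for a hypothetical instruction): if
  // it would be widened at VF = 2 and 4 but scalarized at VF = 8, querying a
  // range [2, 16) below clamps it to [2, 8) and shouldWiden returns true for
  // the clamped range; VF = 8 and larger are answered by a later query.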
  auto WillScalarize = [this, I](unsigned VF) -> bool {
    return CM.isScalarAfterVectorization(I, VF) ||
           CM.isProfitableToScalarize(I, VF) ||
           CM.isScalarWithPredication(I, VF);
  };
  return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
                                                             Range);
}

VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
  auto IsVectorizableOpcode = [](unsigned Opcode) {
    switch (Opcode) {
    case Instruction::Add:
    case Instruction::And:
    case Instruction::AShr:
    case Instruction::BitCast:
    case Instruction::FAdd:
    case Instruction::FCmp:
    case Instruction::FDiv:
    case Instruction::FMul:
    case Instruction::FNeg:
    case Instruction::FPExt:
    case Instruction::FPToSI:
    case Instruction::FPToUI:
    case Instruction::FPTrunc:
    case Instruction::FRem:
    case Instruction::FSub:
    case Instruction::ICmp:
    case Instruction::IntToPtr:
    case Instruction::LShr:
    case Instruction::Mul:
    case Instruction::Or:
    case Instruction::PtrToInt:
    case Instruction::SDiv:
    case Instruction::Select:
    case Instruction::SExt:
    case Instruction::Shl:
    case Instruction::SIToFP:
    case Instruction::SRem:
    case Instruction::Sub:
    case Instruction::Trunc:
    case Instruction::UDiv:
    case Instruction::UIToFP:
    case Instruction::URem:
    case Instruction::Xor:
    case Instruction::ZExt:
      return true;
    }
    return false;
  };

  if (!IsVectorizableOpcode(I->getOpcode()))
    return nullptr;

  // Success: widen this instruction.
  return new VPWidenRecipe(*I);
}

VPBasicBlock *VPRecipeBuilder::handleReplication(
    Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
    DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
    VPlanPtr &Plan) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);

  auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
  setRecipe(I, Recipe);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (auto &Op : I->operands())
    if (auto *PredInst = dyn_cast<Instruction>(Op))
      if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
        PredInst2Recipe[PredInst]->setAlsoPack(false);

  // Finalize the recipe for Instr, first if it is not predicated.
  if (!IsPredicated) {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
    VPBB->appendRecipe(Recipe);
    return VPBB;
  }
  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
  assert(VPBB->getSuccessors().empty() &&
         "VPBB has successors when handling predicated replication.");
  // Record predicated instructions for above packing optimizations.
  PredInst2Recipe[I] = Recipe;
  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
  VPBlockUtils::insertBlockAfter(Region, VPBB);
  auto *RegSucc = new VPBasicBlock();
  VPBlockUtils::insertBlockAfter(RegSucc, Region);
  return RegSucc;
}

VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                      VPRecipeBase *PredRecipe,
                                                      VPlanPtr &Plan) {
  // Instructions marked for predication are replicated and placed under an
  // if-then construct to prevent side-effects.

  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  auto *PHIRecipe =
      Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
  VPBlockUtils::connectBlocks(Pred, Exit);

  return Region;
}

VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
                                                      VFRange &Range,
                                                      VPlanPtr &Plan) {
  // First, check for specific widening recipes that deal with calls, memory
  // operations, inductions and Phi nodes.
  if (auto *CI = dyn_cast<CallInst>(Instr))
    return tryToWidenCall(CI, Range, *Plan);

  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return tryToWidenMemory(Instr, Range, Plan);

  VPRecipeBase *Recipe;
  if (auto Phi = dyn_cast<PHINode>(Instr)) {
    if (Phi->getParent() != OrigLoop->getHeader())
      return tryToBlend(Phi, Plan);
    if ((Recipe = tryToOptimizeInductionPHI(Phi)))
      return Recipe;
    return new VPWidenPHIRecipe(Phi);
  }

  if (isa<TruncInst>(Instr) &&
      (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
    return Recipe;

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return new VPWidenGEPRecipe(GEP, OrigLoop);

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return new VPWidenSelectRecipe(*SI, InvariantCond);
  }

  return tryToWiden(Instr, *Plan);
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
                                                        unsigned MaxVF) {
  assert(OrigLoop->empty() && "Inner loop expected.");

  // Collect conditions feeding internal conditional branches; they need to be
  // represented in VPlan for it to model masking.
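  // For example (illustrative only): in a loop body containing
  //   if (a[i] > 42) { b[i] = 0; }
  // the compare feeding the inner branch needs a VPValue so that the edge and
  // block masks built by VPRecipeBuilder can refer to it.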
  SmallPtrSet<Value *, 1> NeedDef;

  auto *Latch = OrigLoop->getLoopLatch();
  for (BasicBlock *BB : OrigLoop->blocks()) {
    if (BB == Latch)
      continue;
    BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
    if (Branch && Branch->isConditional())
      NeedDef.insert(Branch->getCondition());
  }

  // If the tail is to be folded by masking, the primary induction variable, if
  // it exists, needs to be represented in VPlan for it to model early-exit
  // masking. Also, both the Phi and the live-out instruction of each reduction
  // are required in order to introduce a select between them in VPlan.
  if (CM.foldTailByMasking()) {
    if (Legal->getPrimaryInduction())
      NeedDef.insert(Legal->getPrimaryInduction());
    for (auto &Reduction : Legal->getReductionVars()) {
      NeedDef.insert(Reduction.first);
      NeedDef.insert(Reduction.second.getLoopExitInstr());
    }
  }

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
                                             DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
    SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const DenseMap<Instruction *, Instruction *> &SinkAfter) {

  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of vector value.
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
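  // (Note, not from the original comment: the sink-after pairs recorded by
  // Legality typically stem from first-order recurrences, where a user of the
  // recurrence phi has to be placed after the instruction that produces the
  // previous value.)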
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto applyIG = [IG, this](unsigned VF) -> bool {
      return (VF >= 2 && // Query is illegal for VF == 1
              CM.getWideningDecision(IG->getInsertPos(), VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
    if (!getDecisionAndClampRange(applyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
    for (unsigned i = 0; i < IG->getFactor(); i++)
      if (Instruction *Member = IG->getMember(i))
        RecipeBuilder.recordRecipeOf(Member);
  }

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
  auto Plan = std::make_unique<VPlan>();
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  Plan->setEntry(VPBB);

  // Represent values that will have defs inside VPlan.
  for (Value *V : NeedDef)
    Plan->addVPValue(V);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
      if (isa<BranchInst>(Instr) ||
          DeadInstructions.find(Instr) != DeadInstructions.end())
        continue;

      if (auto Recipe =
              RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
        RecipeBuilder.setRecipe(Instr, Recipe);
        VPBB->appendRecipe(Recipe);
        continue;
      }

      // Otherwise, if all widening options failed, Instruction is to be
      // replicated. This may create a successor for VPBB.
      VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
          Instr, Range, VPBB, PredInst2Recipe, Plan);
      if (NextVPBB != VPBB) {
        VPBB = NextVPBB;
        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
                                    : "");
      }
    }
  }

  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one VPBB, reflecting original
  // basic-blocks with no recipes.
  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
  VPBlockUtils::disconnectBlocks(PreEntry, Entry);
  delete PreEntry;

  // ---------------------------------------------------------------------------
  // Transform initial VPlan: Apply previously taken decisions, in order, to
  // bring the VPlan to its final state.
  // ---------------------------------------------------------------------------

  // Apply Sink-After legal constraints.
  for (auto &Entry : SinkAfter) {
    VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
    VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
    Sink->moveAfter(Target);
  }

  // Interleave memory: for each Interleave Group we marked earlier as relevant
  // for this VPlan, replace the Recipes widening its memory instructions with a
  // single VPInterleaveRecipe at its insertion point.
  for (auto IG : InterleaveGroups) {
    auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
        RecipeBuilder.getRecipe(IG->getInsertPos()));
    (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
        ->insertBefore(Recipe);

    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (Instruction *Member = IG->getMember(i)) {
        RecipeBuilder.getRecipe(Member)->eraseFromParent();
      }
  }

  // Finally, if tail is folded by masking, introduce selects between the phi
  // and the live-out instruction of each reduction, at the end of the latch.
  if (CM.foldTailByMasking()) {
    Builder.setInsertPoint(VPBB);
    auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
    for (auto &Reduction : Legal->getReductionVars()) {
      VPValue *Phi = Plan->getVPValue(Reduction.first);
      VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
      Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
    }
  }

  std::string PlanName;
  raw_string_ostream RSO(PlanName);
  unsigned VF = Range.Start;
  Plan->addVF(VF);
  RSO << "Initial VPlan for VF={" << VF;
  for (VF *= 2; VF < Range.End; VF *= 2) {
    Plan->addVF(VF);
    RSO << "," << VF;
  }
  RSO << "},UF>=1";
  RSO.flush();
  Plan->setName(PlanName);

  return Plan;
}

VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->empty());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanTransforms::VPInstructionsToVPRecipes(
      OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
  return Plan;
}

Value* LoopVectorizationPlanner::VPCallbackILV::
getOrCreateVectorValues(Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}

Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
    Value *V, const VPIteration &Instance) {
  return ILV.getOrCreateScalarValue(V, Instance);
}

void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << " +\n"
    << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  O << ", ";
  getAddr()->printAsOperand(O, SlotTracker);
  VPValue *Mask = getMask();
  if (Mask) {
    O << ", ";
    Mask->printAsOperand(O, SlotTracker);
  }
  O << "\\l\"";
  for (unsigned i = 0; i < IG->getFactor(); ++i)
    if (Instruction *I = IG->getMember(i))
      O << " +\n"
        << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\"";
}

void VPWidenCallRecipe::execute(VPTransformState &State) {
  State.ILV->widenCallInstruction(Ingredient, User, State);
}

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.ILV->widenSelectInstruction(Ingredient, InvariantCond);
}

void VPWidenRecipe::execute(VPTransformState &State) {
  State.ILV->widenInstruction(Ingredient);
}

void VPWidenGEPRecipe::execute(VPTransformState &State) {
  State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
                      IsIndexLoopInvariant);
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");
  State.ILV->widenIntOrFpInduction(IV, Trunc);
}

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
}

void VPBlendRecipe::execute(VPTransformState &State) {
  State.ILV->setDebugLocFromInst(State.Builder, Phi);
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi and
  // are essentially undef are taken from In0.
  InnerLoopVectorizer::VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 = State.get(getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(getMask(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
}

void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF > 1) {
      // If we're constructing lane 0, initialize to start from undef.
      if (State.Instance->Lane == 0) {
        Value *Undef =
            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF;
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  if (!User) // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();
  else {
    VPValue *BlockInMask = User->getOperand(0);
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  }

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst = cast<Instruction>(
      State.ValueMap.getScalarValue(PredInst, *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed.
  // In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.ValueMap.hasVectorValue(PredInst, Part)) {
    Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
  } else {
    Type *PredInstType = PredInst->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
  }
}

void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
  VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
  State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
                                        getMask());
}

// Determine how to lower the scalar epilogue, which depends on 1) optimising
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyses whether the loop is suitable
// for predication.
static ScalarEpilogueLowering getScalarEpilogueLowering(
    Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
    BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
    AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
    LoopVectorizationLegality &LVL) {
  bool OptSize =
      F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
                                                     PGSOQueryType::IRPass);
  // 1) OptSize takes precedence over all other options, i.e. if this is set,
  // don't look at hints or options, and don't request a scalar epilogue.
  if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
    return CM_ScalarEpilogueNotAllowedOptSize;

  bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
                              !PreferPredicateOverEpilog;

  // 2) Next, if disabling predication is requested on the command line, honour
  // this and request a scalar epilogue.
  if (PredicateOptDisabled)
    return CM_ScalarEpilogueAllowed;

  // 3) and 4) look if enabling predication is requested on the command line,
  // with a loop hint, or if the TTI hook indicates this is profitable, request
  // predication.
  if (PreferPredicateOverEpilog ||
      Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
      (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
                                        LVL.getLAI()) &&
       Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {

  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);

  // Get user vectorization factor.
  const unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  LVP.setBestPlan(VF.Width, 1);

  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                         &CM);
  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                    << L->getHeader()->getParent()->getName() << "\"\n");
  LVP.executePlan(LB, DT);

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();

  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}

LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}

bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->empty()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " unroll=" << Hints.getInterleave() << "\n");

  // Function containing loop
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements(*ORE);
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->empty())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints);

  assert(L->empty() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem possibly correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get user vectorization factor.
  unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;
  unsigned UserIC = Hints.getInterleave();

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width == 1) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not legal to vectorize the loop, then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                               &CM);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is *legal* to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling a scalar loop when there are
    // no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}

bool LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return false;

  bool Changed = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= processLoop(L);
  }

  // Process each loop nest in the function.
  return Changed;
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  return PA;
}
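// Usage note (illustrative, not part of the original file): the pass is
// registered under the name given by LV_NAME, "loop-vectorize", so it can be
// run in isolation with, e.g.,
//   opt -passes=loop-vectorize -S input.ll
// under the new pass manager, or with -loop-vectorize under the legacy one.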