//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
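//
// As a conceptual example (not taken from this file), with a vectorization
// factor of 4 a scalar loop such as
//
//   for (i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is rewritten so that each iteration of the new loop loads b[i..i+3] and
// c[i..i+3] as vectors, performs a single vector add, stores a[i..i+3], and
// advances 'i' by 4 instead of 1. Iterations left over when n is not a
// multiple of 4 are handled by a scalar epilogue loop or, when the tail is
// folded, by predicating the vector loop body.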
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired and that predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of a loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
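  // (Illustrative example: on common X86 data layouts x86_fp80 has a type
  // size of 80 bits but a larger alloc size, so an array of x86_fp80 needs
  // padding and the type is considered irregular; i32 is not.)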
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found.
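///
/// A rough sketch of how a driver is expected to use this class (see
/// LoopVectorizationPlanner::executePlan): create the new loop skeleton with
/// createVectorizedLoopSkeleton(), widen the instructions while executing the
/// selected VPlan, and finally call fixVectorizedLoop() to repair phis,
/// live-outs and other loose ends.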
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, bool InvariantCond);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
                bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                VPTransformState &State, VPValue *Addr,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr, VPValue *StoredValue,
                                  VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AliasAnalysis *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
    const StringRef OREMsg, const StringRef ORETag,
    OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
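///
/// In rough terms, the planner is expected to drive this model by first
/// calling computeMaxVF(), then selectVectorizationFactor() (or
/// selectUserVectorizationFactor() when a VF was forced), and finally
/// selectInterleaveCount() for the chosen VF; all of these are declared below.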
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Set up cost-based decisions for the user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usage of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
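    // (Results are cached per VF in the Uniforms/Scalars maps below; VF == 1
    // needs no analysis at all.)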
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. it is not prevented
  /// due to optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized - i.e. either a vector version isn't available, or it is too
  /// expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

  /// The cost computation for interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);

  /// The cost computation for Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, unsigned VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);

  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
1424 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars; 1425 1426 /// Holds the instructions (address computations) that are forced to be 1427 /// scalarized. 1428 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1429 1430 /// Returns the expected difference in cost from scalarizing the expression 1431 /// feeding a predicated instruction \p PredInst. The instructions to 1432 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1433 /// non-negative return value implies the expression will be scalarized. 1434 /// Currently, only single-use chains are considered for scalarization. 1435 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1436 unsigned VF); 1437 1438 /// Collect the instructions that are uniform after vectorization. An 1439 /// instruction is uniform if we represent it with a single scalar value in 1440 /// the vectorized loop corresponding to each vector iteration. Examples of 1441 /// uniform instructions include pointer operands of consecutive or 1442 /// interleaved memory accesses. Note that although uniformity implies an 1443 /// instruction will be scalar, the reverse is not true. In general, a 1444 /// scalarized instruction will be represented by VF scalar values in the 1445 /// vectorized loop, each corresponding to an iteration of the original 1446 /// scalar loop. 1447 void collectLoopUniforms(unsigned VF); 1448 1449 /// Collect the instructions that are scalar after vectorization. An 1450 /// instruction is scalar if it is known to be uniform or will be scalarized 1451 /// during vectorization. Non-uniform scalarized instructions will be 1452 /// represented by VF values in the vectorized loop, each corresponding to an 1453 /// iteration of the original scalar loop. 1454 void collectLoopScalars(unsigned VF); 1455 1456 /// Keeps cost model vectorization decision and cost for instructions. 1457 /// Right now it is used for memory instructions only. 1458 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, 1459 std::pair<InstWidening, unsigned>>; 1460 1461 DecisionList WideningDecisions; 1462 1463 /// Returns true if \p V is expected to be vectorized and it needs to be 1464 /// extracted. 1465 bool needsExtract(Value *V, unsigned VF) const { 1466 Instruction *I = dyn_cast<Instruction>(V); 1467 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) 1468 return false; 1469 1470 // Assume we can vectorize V (and hence we need extraction) if the 1471 // scalars are not computed yet. This can happen, because it is called 1472 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1473 // the scalars are collected. That should be a safe assumption in most 1474 // cases, because we check if the operands have vectorizable types 1475 // beforehand in LoopVectorizationLegality. 1476 return Scalars.find(VF) == Scalars.end() || 1477 !isScalarAfterVectorization(I, VF); 1478 }; 1479 1480 /// Returns a range containing only operands needing to be extracted. 1481 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1482 unsigned VF) { 1483 return SmallVector<Value *, 4>(make_filter_range( 1484 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1485 } 1486 1487 public: 1488 /// The loop that we evaluate. 1489 Loop *TheLoop; 1490 1491 /// Predicated scalar evolution analysis. 1492 PredicatedScalarEvolution &PSE; 1493 1494 /// Loop Info analysis. 1495 LoopInfo *LI; 1496 1497 /// Vectorization legality. 
1498 LoopVectorizationLegality *Legal; 1499 1500 /// Vector target information. 1501 const TargetTransformInfo &TTI; 1502 1503 /// Target Library Info. 1504 const TargetLibraryInfo *TLI; 1505 1506 /// Demanded bits analysis. 1507 DemandedBits *DB; 1508 1509 /// Assumption cache. 1510 AssumptionCache *AC; 1511 1512 /// Interface to emit optimization remarks. 1513 OptimizationRemarkEmitter *ORE; 1514 1515 const Function *TheFunction; 1516 1517 /// Loop Vectorize Hint. 1518 const LoopVectorizeHints *Hints; 1519 1520 /// The interleave access information contains groups of interleaved accesses 1521 /// with the same stride and close to each other. 1522 InterleavedAccessInfo &InterleaveInfo; 1523 1524 /// Values to ignore in the cost model. 1525 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1526 1527 /// Values to ignore in the cost model when VF > 1. 1528 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1529 }; 1530 1531 } // end namespace llvm 1532 1533 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1534 // vectorization. The loop needs to be annotated with #pragma omp simd 1535 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1536 // vector length information is not provided, vectorization is not considered 1537 // explicit. Interleave hints are not allowed either. These limitations will be 1538 // relaxed in the future. 1539 // Please, note that we are currently forced to abuse the pragma 'clang 1540 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1541 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1542 // provides *explicit vectorization hints* (LV can bypass legal checks and 1543 // assume that vectorization is legal). However, both hints are implemented 1544 // using the same metadata (llvm.loop.vectorize, processed by 1545 // LoopVectorizeHints). This will be fixed in the future when the native IR 1546 // representation for pragma 'omp simd' is introduced. 1547 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1548 OptimizationRemarkEmitter *ORE) { 1549 assert(!OuterLp->empty() && "This is not an outer loop"); 1550 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1551 1552 // Only outer loops with an explicit vectorization hint are supported. 1553 // Unannotated outer loops are ignored. 1554 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1555 return false; 1556 1557 Function *Fn = OuterLp->getHeader()->getParent(); 1558 if (!Hints.allowVectorization(Fn, OuterLp, 1559 true /*VectorizeOnlyWhenForced*/)) { 1560 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1561 return false; 1562 } 1563 1564 if (Hints.getInterleave() > 1) { 1565 // TODO: Interleave support is future work. 1566 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1567 "outer loops.\n"); 1568 Hints.emitRemarkWithHints(); 1569 return false; 1570 } 1571 1572 return true; 1573 } 1574 1575 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1576 OptimizationRemarkEmitter *ORE, 1577 SmallVectorImpl<Loop *> &V) { 1578 // Collect inner loops and outer loops without irreducible control flow. For 1579 // now, only collect outer loops that have explicit vectorization hints. If we 1580 // are stress testing the VPlan H-CFG construction, we collect the outermost 1581 // loop of every loop nest. 
1582 if (L.empty() || VPlanBuildStressTest || 1583 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1584 LoopBlocksRPO RPOT(&L); 1585 RPOT.perform(LI); 1586 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1587 V.push_back(&L); 1588 // TODO: Collect inner loops inside marked outer loops in case 1589 // vectorization fails for the outer loop. Do not invoke 1590 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1591 // already known to be reducible. We can use an inherited attribute for 1592 // that. 1593 return; 1594 } 1595 } 1596 for (Loop *InnerL : L) 1597 collectSupportedLoops(*InnerL, LI, ORE, V); 1598 } 1599 1600 namespace { 1601 1602 /// The LoopVectorize Pass. 1603 struct LoopVectorize : public FunctionPass { 1604 /// Pass identification, replacement for typeid 1605 static char ID; 1606 1607 LoopVectorizePass Impl; 1608 1609 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1610 bool VectorizeOnlyWhenForced = false) 1611 : FunctionPass(ID), 1612 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1613 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1614 } 1615 1616 bool runOnFunction(Function &F) override { 1617 if (skipFunction(F)) 1618 return false; 1619 1620 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1621 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1622 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1623 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1624 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1625 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1626 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1627 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1628 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1629 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1630 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1631 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1632 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1633 1634 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1635 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1636 1637 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1638 GetLAA, *ORE, PSI).MadeAnyChange; 1639 } 1640 1641 void getAnalysisUsage(AnalysisUsage &AU) const override { 1642 AU.addRequired<AssumptionCacheTracker>(); 1643 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1644 AU.addRequired<DominatorTreeWrapperPass>(); 1645 AU.addRequired<LoopInfoWrapperPass>(); 1646 AU.addRequired<ScalarEvolutionWrapperPass>(); 1647 AU.addRequired<TargetTransformInfoWrapperPass>(); 1648 AU.addRequired<AAResultsWrapperPass>(); 1649 AU.addRequired<LoopAccessLegacyAnalysis>(); 1650 AU.addRequired<DemandedBitsWrapperPass>(); 1651 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1652 AU.addRequired<InjectTLIMappingsLegacy>(); 1653 1654 // We currently do not preserve loopinfo/dominator analyses with outer loop 1655 // vectorization. Until this is addressed, mark these analyses as preserved 1656 // only for non-VPlan-native path. 1657 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
1658 if (!EnableVPlanNativePath) { 1659 AU.addPreserved<LoopInfoWrapperPass>(); 1660 AU.addPreserved<DominatorTreeWrapperPass>(); 1661 } 1662 1663 AU.addPreserved<BasicAAWrapperPass>(); 1664 AU.addPreserved<GlobalsAAWrapperPass>(); 1665 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1666 } 1667 }; 1668 1669 } // end anonymous namespace 1670 1671 //===----------------------------------------------------------------------===// 1672 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1673 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1674 //===----------------------------------------------------------------------===// 1675 1676 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1677 // We need to place the broadcast of invariant variables outside the loop, 1678 // but only if it's proven safe to do so. Else, broadcast will be inside 1679 // vector loop body. 1680 Instruction *Instr = dyn_cast<Instruction>(V); 1681 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1682 (!Instr || 1683 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1684 // Place the code for broadcasting invariant variables in the new preheader. 1685 IRBuilder<>::InsertPointGuard Guard(Builder); 1686 if (SafeToHoist) 1687 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1688 1689 // Broadcast the scalar into all locations in the vector. 1690 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1691 1692 return Shuf; 1693 } 1694 1695 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1696 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1697 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1698 "Expected either an induction phi-node or a truncate of it!"); 1699 Value *Start = II.getStartValue(); 1700 1701 // Construct the initial value of the vector IV in the vector loop preheader 1702 auto CurrIP = Builder.saveIP(); 1703 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1704 if (isa<TruncInst>(EntryVal)) { 1705 assert(Start->getType()->isIntegerTy() && 1706 "Truncation requires an integer type"); 1707 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1708 Step = Builder.CreateTrunc(Step, TruncType); 1709 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1710 } 1711 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1712 Value *SteppedStart = 1713 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1714 1715 // We create vector phi nodes for both integer and floating-point induction 1716 // variables. Here, we determine the kind of arithmetic we will perform. 1717 Instruction::BinaryOps AddOp; 1718 Instruction::BinaryOps MulOp; 1719 if (Step->getType()->isIntegerTy()) { 1720 AddOp = Instruction::Add; 1721 MulOp = Instruction::Mul; 1722 } else { 1723 AddOp = II.getInductionOpcode(); 1724 MulOp = Instruction::FMul; 1725 } 1726 1727 // Multiply the vectorization factor by the step using integer or 1728 // floating-point arithmetic as appropriate. 1729 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); 1730 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1731 1732 // Create a vector splat to use in the induction update. 1733 // 1734 // FIXME: If the step is non-constant, we create the vector splat with 1735 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1736 // handle a constant vector splat. 1737 Value *SplatVF = 1738 isa<Constant>(Mul) 1739 ? 
ConstantVector::getSplat({VF, false}, cast<Constant>(Mul)) 1740 : Builder.CreateVectorSplat(VF, Mul); 1741 Builder.restoreIP(CurrIP); 1742 1743 // We may need to add the step a number of times, depending on the unroll 1744 // factor. The last of those goes into the PHI. 1745 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1746 &*LoopVectorBody->getFirstInsertionPt()); 1747 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1748 Instruction *LastInduction = VecInd; 1749 for (unsigned Part = 0; Part < UF; ++Part) { 1750 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1751 1752 if (isa<TruncInst>(EntryVal)) 1753 addMetadata(LastInduction, EntryVal); 1754 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1755 1756 LastInduction = cast<Instruction>(addFastMathFlag( 1757 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1758 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1759 } 1760 1761 // Move the last step to the end of the latch block. This ensures consistent 1762 // placement of all induction updates. 1763 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1764 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1765 auto *ICmp = cast<Instruction>(Br->getCondition()); 1766 LastInduction->moveBefore(ICmp); 1767 LastInduction->setName("vec.ind.next"); 1768 1769 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1770 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1771 } 1772 1773 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1774 return Cost->isScalarAfterVectorization(I, VF) || 1775 Cost->isProfitableToScalarize(I, VF); 1776 } 1777 1778 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1779 if (shouldScalarizeInstruction(IV)) 1780 return true; 1781 auto isScalarInst = [&](User *U) -> bool { 1782 auto *I = cast<Instruction>(U); 1783 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1784 }; 1785 return llvm::any_of(IV->users(), isScalarInst); 1786 } 1787 1788 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1789 const InductionDescriptor &ID, const Instruction *EntryVal, 1790 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1791 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1792 "Expected either an induction phi-node or a truncate of it!"); 1793 1794 // This induction variable is not the phi from the original loop but the 1795 // newly-created IV based on the proof that casted Phi is equal to the 1796 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1797 // re-uses the same InductionDescriptor that original IV uses but we don't 1798 // have to do any recording in this case - that is done when original IV is 1799 // processed. 1800 if (isa<TruncInst>(EntryVal)) 1801 return; 1802 1803 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1804 if (Casts.empty()) 1805 return; 1806 // Only the first Cast instruction in the Casts vector is of interest. 1807 // The rest of the Casts (if exist) have no uses outside the 1808 // induction update chain itself. 
1809 Instruction *CastInst = *Casts.begin(); 1810 if (Lane < UINT_MAX) 1811 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1812 else 1813 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1814 } 1815 1816 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1817 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1818 "Primary induction variable must have an integer type"); 1819 1820 auto II = Legal->getInductionVars().find(IV); 1821 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 1822 1823 auto ID = II->second; 1824 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1825 1826 // The value from the original loop to which we are mapping the new induction 1827 // variable. 1828 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1829 1830 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1831 1832 // Generate code for the induction step. Note that induction steps are 1833 // required to be loop-invariant 1834 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 1835 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 1836 "Induction step should be loop invariant"); 1837 if (PSE.getSE()->isSCEVable(IV->getType())) { 1838 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1839 return Exp.expandCodeFor(Step, Step->getType(), 1840 LoopVectorPreHeader->getTerminator()); 1841 } 1842 return cast<SCEVUnknown>(Step)->getValue(); 1843 }; 1844 1845 // The scalar value to broadcast. This is derived from the canonical 1846 // induction variable. If a truncation type is given, truncate the canonical 1847 // induction variable and step. Otherwise, derive these values from the 1848 // induction descriptor. 1849 auto CreateScalarIV = [&](Value *&Step) -> Value * { 1850 Value *ScalarIV = Induction; 1851 if (IV != OldInduction) { 1852 ScalarIV = IV->getType()->isIntegerTy() 1853 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1854 : Builder.CreateCast(Instruction::SIToFP, Induction, 1855 IV->getType()); 1856 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1857 ScalarIV->setName("offset.idx"); 1858 } 1859 if (Trunc) { 1860 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1861 assert(Step->getType()->isIntegerTy() && 1862 "Truncation requires an integer step"); 1863 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1864 Step = Builder.CreateTrunc(Step, TruncType); 1865 } 1866 return ScalarIV; 1867 }; 1868 1869 // Create the vector values from the scalar IV, in the absence of creating a 1870 // vector IV. 1871 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 1872 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1873 for (unsigned Part = 0; Part < UF; ++Part) { 1874 Value *EntryPart = 1875 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); 1876 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1877 if (Trunc) 1878 addMetadata(EntryPart, Trunc); 1879 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1880 } 1881 }; 1882 1883 // Now do the actual transformations, and start with creating the step value. 1884 Value *Step = CreateStepValue(ID.getStep()); 1885 if (VF <= 1) { 1886 Value *ScalarIV = CreateScalarIV(Step); 1887 CreateSplatIV(ScalarIV, Step); 1888 return; 1889 } 1890 1891 // Determine if we want a scalar version of the induction variable. 
This is 1892 // true if the induction variable itself is not widened, or if it has at 1893 // least one user in the loop that is not widened. 1894 auto NeedsScalarIV = needsScalarInduction(EntryVal); 1895 if (!NeedsScalarIV) { 1896 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1897 return; 1898 } 1899 1900 // Try to create a new independent vector induction variable. If we can't 1901 // create the phi node, we will splat the scalar induction variable in each 1902 // loop iteration. 1903 if (!shouldScalarizeInstruction(EntryVal)) { 1904 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1905 Value *ScalarIV = CreateScalarIV(Step); 1906 // Create scalar steps that can be used by instructions we will later 1907 // scalarize. Note that the addition of the scalar steps will not increase 1908 // the number of instructions in the loop in the common case prior to 1909 // InstCombine. We will be trading one vector extract for each scalar step. 1910 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1911 return; 1912 } 1913 1914 // All IV users are scalar instructions, so only emit a scalar IV, not a 1915 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 1916 // predicate used by the masked loads/stores. 1917 Value *ScalarIV = CreateScalarIV(Step); 1918 if (!Cost->isScalarEpilogueAllowed()) 1919 CreateSplatIV(ScalarIV, Step); 1920 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1921 } 1922 1923 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 1924 Instruction::BinaryOps BinOp) { 1925 // Create and check the types. 1926 auto *ValVTy = cast<VectorType>(Val->getType()); 1927 int VLen = ValVTy->getNumElements(); 1928 1929 Type *STy = Val->getType()->getScalarType(); 1930 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1931 "Induction Step must be an integer or FP"); 1932 assert(Step->getType() == STy && "Step has wrong type"); 1933 1934 SmallVector<Constant *, 8> Indices; 1935 1936 if (STy->isIntegerTy()) { 1937 // Create a vector of consecutive numbers from zero to VF. 1938 for (int i = 0; i < VLen; ++i) 1939 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 1940 1941 // Add the consecutive indices to the vector value. 1942 Constant *Cv = ConstantVector::get(Indices); 1943 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 1944 Step = Builder.CreateVectorSplat(VLen, Step); 1945 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1946 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 1947 // which can be found from the original scalar operations. 1948 Step = Builder.CreateMul(Cv, Step); 1949 return Builder.CreateAdd(Val, Step, "induction"); 1950 } 1951 1952 // Floating point induction. 1953 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1954 "Binary Opcode should be specified for FP induction"); 1955 // Create a vector of consecutive numbers from zero to VF. 1956 for (int i = 0; i < VLen; ++i) 1957 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 1958 1959 // Add the consecutive indices to the vector value. 1960 Constant *Cv = ConstantVector::get(Indices); 1961 1962 Step = Builder.CreateVectorSplat(VLen, Step); 1963 1964 // Floating point operations had to be 'fast' to enable the induction. 
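  // Illustrative sketch (assumed values, not verbatim generated IR): for a
  // float induction with step S, StartIdx = 0 and VF = 4, the code below
  // effectively computes
  //   <0.0, 1.0, 2.0, 3.0> fmul splat(S)   ; per-lane offsets
  //   Val fadd/fsub <that product>          ; the "induction" vector
  // with fast-math flags attached to both operations when they are
  // instructions.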
1965 FastMathFlags Flags; 1966 Flags.setFast(); 1967 1968 Value *MulOp = Builder.CreateFMul(Cv, Step); 1969 if (isa<Instruction>(MulOp)) 1970 // Have to check, MulOp may be a constant 1971 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 1972 1973 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1974 if (isa<Instruction>(BOp)) 1975 cast<Instruction>(BOp)->setFastMathFlags(Flags); 1976 return BOp; 1977 } 1978 1979 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 1980 Instruction *EntryVal, 1981 const InductionDescriptor &ID) { 1982 // We shouldn't have to build scalar steps if we aren't vectorizing. 1983 assert(VF > 1 && "VF should be greater than one"); 1984 1985 // Get the value type and ensure it and the step have the same integer type. 1986 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 1987 assert(ScalarIVTy == Step->getType() && 1988 "Val and Step should have the same type"); 1989 1990 // We build scalar steps for both integer and floating-point induction 1991 // variables. Here, we determine the kind of arithmetic we will perform. 1992 Instruction::BinaryOps AddOp; 1993 Instruction::BinaryOps MulOp; 1994 if (ScalarIVTy->isIntegerTy()) { 1995 AddOp = Instruction::Add; 1996 MulOp = Instruction::Mul; 1997 } else { 1998 AddOp = ID.getInductionOpcode(); 1999 MulOp = Instruction::FMul; 2000 } 2001 2002 // Determine the number of scalars we need to generate for each unroll 2003 // iteration. If EntryVal is uniform, we only need to generate the first 2004 // lane. Otherwise, we generate all VF values. 2005 unsigned Lanes = 2006 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 2007 : VF; 2008 // Compute the scalar steps and save the results in VectorLoopValueMap. 2009 for (unsigned Part = 0; Part < UF; ++Part) { 2010 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2011 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); 2012 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2013 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2014 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2015 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2016 } 2017 } 2018 } 2019 2020 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2021 assert(V != Induction && "The new induction variable should not be used."); 2022 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2023 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2024 2025 // If we have a stride that is replaced by one, do it here. Defer this for 2026 // the VPlan-native path until we start running Legal checks in that path. 2027 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2028 V = ConstantInt::get(V->getType(), 1); 2029 2030 // If we have a vector mapped to this value, return it. 2031 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2032 return VectorLoopValueMap.getVectorValue(V, Part); 2033 2034 // If the value has not been vectorized, check if it has been scalarized 2035 // instead. If it has been scalarized, and we actually need the value in 2036 // vector form, we will construct the vector values on demand. 2037 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2038 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2039 2040 // If we've scalarized a value, that value should be an instruction. 
    auto *I = cast<Instruction>(V);

    // If we aren't vectorizing, we can just copy the scalar map values over to
    // the vector map.
    if (VF == 1) {
      VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
      return ScalarValue;
    }

    // Get the last scalar instruction we generated for V and Part. If the
    // value is known to be uniform after vectorization, this corresponds to
    // lane zero of the Part unroll iteration. Otherwise, the last instruction
    // is the one we created for the last vector lane of the Part unroll
    // iteration.
    unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
    auto *LastInst = cast<Instruction>(
        VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));

    // Set the insert point after the last scalarized instruction. This ensures
    // the insertelement sequence will directly follow the scalar definitions.
    auto OldIP = Builder.saveIP();
    auto NewIP = std::next(BasicBlock::iterator(LastInst));
    Builder.SetInsertPoint(&*NewIP);

    // However, if we are vectorizing, we need to construct the vector values.
    // If the value is known to be uniform after vectorization, we can just
    // broadcast the scalar value corresponding to lane zero for each unroll
    // iteration. Otherwise, we construct the vector values using insertelement
    // instructions. Since the resulting vectors are stored in
    // VectorLoopValueMap, we will only generate the insertelements once.
    Value *VectorValue = nullptr;
    if (Cost->isUniformAfterVectorization(I, VF)) {
      VectorValue = getBroadcastInstrs(ScalarValue);
      VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
    } else {
      // Initialize packing with insertelements to start from undef.
      Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
      VectorLoopValueMap.setVectorValue(V, Part, Undef);
      for (unsigned Lane = 0; Lane < VF; ++Lane)
        packScalarIntoVectorValue(V, {Part, Lane});
      VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
    }
    Builder.restoreIP(OldIP);
    return VectorValue;
  }

  // If this scalar is unknown, assume that it is a constant or that it is
  // loop invariant. Broadcast V and save the value for future uses.
  Value *B = getBroadcastInstrs(V);
  VectorLoopValueMap.setVectorValue(V, Part, B);
  return B;
}

Value *
InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
                                            const VPIteration &Instance) {
  // If the value is not an instruction contained in the loop, it should
  // already be scalar.
  if (OrigLoop->isLoopInvariant(V))
    return V;

  assert((Instance.Lane == 0 ||
          !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
         "Uniform values only have lane zero");

  // If the value from the original loop has not been vectorized, it is
  // represented by UF x VF scalar values in the new loop. Return the requested
  // scalar value.
  if (VectorLoopValueMap.hasScalarValue(V, Instance))
    return VectorLoopValueMap.getScalarValue(V, Instance);

  // If the value has not been scalarized, get its entry in VectorLoopValueMap
  // for the given unroll part. If this entry is not a vector type (i.e., the
  // vectorization factor is one), there is no need to generate an
  // extractelement instruction.
2115 auto *U = getOrCreateVectorValue(V, Instance.Part); 2116 if (!U->getType()->isVectorTy()) { 2117 assert(VF == 1 && "Value not scalarized has non-vector type"); 2118 return U; 2119 } 2120 2121 // Otherwise, the value from the original loop has been vectorized and is 2122 // represented by UF vector values. Extract and return the requested scalar 2123 // value from the appropriate vector lane. 2124 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2125 } 2126 2127 void InnerLoopVectorizer::packScalarIntoVectorValue( 2128 Value *V, const VPIteration &Instance) { 2129 assert(V != Induction && "The new induction variable should not be used."); 2130 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2131 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2132 2133 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2134 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2135 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2136 Builder.getInt32(Instance.Lane)); 2137 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2138 } 2139 2140 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2141 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2142 SmallVector<int, 8> ShuffleMask; 2143 for (unsigned i = 0; i < VF; ++i) 2144 ShuffleMask.push_back(VF - i - 1); 2145 2146 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2147 ShuffleMask, "reverse"); 2148 } 2149 2150 // Return whether we allow using masked interleave-groups (for dealing with 2151 // strided loads/stores that reside in predicated blocks, or for dealing 2152 // with gaps). 2153 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2154 // If an override option has been passed in for interleaved accesses, use it. 2155 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2156 return EnableMaskedInterleavedMemAccesses; 2157 2158 return TTI.enableMaskedInterleavedAccessVectorization(); 2159 } 2160 2161 // Try to vectorize the interleave group that \p Instr belongs to. 2162 // 2163 // E.g. Translate following interleaved load group (factor = 3): 2164 // for (i = 0; i < N; i+=3) { 2165 // R = Pic[i]; // Member of index 0 2166 // G = Pic[i+1]; // Member of index 1 2167 // B = Pic[i+2]; // Member of index 2 2168 // ... // do something to R, G, B 2169 // } 2170 // To: 2171 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2172 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2173 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2174 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2175 // 2176 // Or translate following interleaved store group (factor = 3): 2177 // for (i = 0; i < N; i+=3) { 2178 // ... 
do something to R, G, B 2179 // Pic[i] = R; // Member of index 0 2180 // Pic[i+1] = G; // Member of index 1 2181 // Pic[i+2] = B; // Member of index 2 2182 // } 2183 // To: 2184 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2185 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2186 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2187 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2188 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2189 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2190 const InterleaveGroup<Instruction> *Group, VPTransformState &State, 2191 VPValue *Addr, VPValue *BlockInMask) { 2192 Instruction *Instr = Group->getInsertPos(); 2193 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2194 2195 // Prepare for the vector type of the interleaved load/store. 2196 Type *ScalarTy = getMemInstValueType(Instr); 2197 unsigned InterleaveFactor = Group->getFactor(); 2198 Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); 2199 2200 // Prepare for the new pointers. 2201 SmallVector<Value *, 2> AddrParts; 2202 unsigned Index = Group->getIndex(Instr); 2203 2204 // TODO: extend the masked interleaved-group support to reversed access. 2205 assert((!BlockInMask || !Group->isReverse()) && 2206 "Reversed masked interleave-group not supported."); 2207 2208 // If the group is reverse, adjust the index to refer to the last vector lane 2209 // instead of the first. We adjust the index from the first vector lane, 2210 // rather than directly getting the pointer for lane VF - 1, because the 2211 // pointer operand of the interleaved access is supposed to be uniform. For 2212 // uniform instructions, we're only required to generate a value for the 2213 // first vector lane in each unroll iteration. 2214 if (Group->isReverse()) 2215 Index += (VF - 1) * Group->getFactor(); 2216 2217 for (unsigned Part = 0; Part < UF; Part++) { 2218 Value *AddrPart = State.get(Addr, {Part, 0}); 2219 setDebugLocFromInst(Builder, AddrPart); 2220 2221 // Notice current instruction could be any index. Need to adjust the address 2222 // to the member of index 0. 2223 // 2224 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2225 // b = A[i]; // Member of index 0 2226 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2227 // 2228 // E.g. A[i+1] = a; // Member of index 1 2229 // A[i] = b; // Member of index 0 2230 // A[i+2] = c; // Member of index 2 (Current instruction) 2231 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2232 2233 bool InBounds = false; 2234 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2235 InBounds = gep->isInBounds(); 2236 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2237 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2238 2239 // Cast to the vector pointer type. 2240 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2241 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2242 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2243 } 2244 2245 setDebugLocFromInst(Builder, Instr); 2246 Value *UndefVec = UndefValue::get(VecTy); 2247 2248 Value *MaskForGaps = nullptr; 2249 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2250 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); 2251 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2252 } 2253 2254 // Vectorize the interleaved load group. 
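  // Illustrative example (assumed shapes, not verbatim output): with
  // InterleaveFactor = 3 and VF = 4, a block mask <m0, m1, m2, m3> is
  // replicated by createReplicatedMask(3, 4) into
  //   <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3>
  // so that all members of one tuple share the predicate of their scalar
  // iteration; if a gap mask is also needed, it is AND'ed on top of this.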
2255 if (isa<LoadInst>(Instr)) { 2256 // For each unroll part, create a wide load for the group. 2257 SmallVector<Value *, 2> NewLoads; 2258 for (unsigned Part = 0; Part < UF; Part++) { 2259 Instruction *NewLoad; 2260 if (BlockInMask || MaskForGaps) { 2261 assert(useMaskedInterleavedAccesses(*TTI) && 2262 "masked interleaved groups are not allowed."); 2263 Value *GroupMask = MaskForGaps; 2264 if (BlockInMask) { 2265 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2266 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2267 Value *ShuffledMask = Builder.CreateShuffleVector( 2268 BlockInMaskPart, Undefs, 2269 createReplicatedMask(InterleaveFactor, VF), "interleaved.mask"); 2270 GroupMask = MaskForGaps 2271 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2272 MaskForGaps) 2273 : ShuffledMask; 2274 } 2275 NewLoad = 2276 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2277 GroupMask, UndefVec, "wide.masked.vec"); 2278 } 2279 else 2280 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2281 Group->getAlign(), "wide.vec"); 2282 Group->addMetadata(NewLoad); 2283 NewLoads.push_back(NewLoad); 2284 } 2285 2286 // For each member in the group, shuffle out the appropriate data from the 2287 // wide loads. 2288 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2289 Instruction *Member = Group->getMember(I); 2290 2291 // Skip the gaps in the group. 2292 if (!Member) 2293 continue; 2294 2295 auto StrideMask = createStrideMask(I, InterleaveFactor, VF); 2296 for (unsigned Part = 0; Part < UF; Part++) { 2297 Value *StridedVec = Builder.CreateShuffleVector( 2298 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2299 2300 // If this member has different type, cast the result type. 2301 if (Member->getType() != ScalarTy) { 2302 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2303 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2304 } 2305 2306 if (Group->isReverse()) 2307 StridedVec = reverseVector(StridedVec); 2308 2309 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2310 } 2311 } 2312 return; 2313 } 2314 2315 // The sub vector type for current instruction. 2316 VectorType *SubVT = VectorType::get(ScalarTy, VF); 2317 2318 // Vectorize the interleaved store group. 2319 for (unsigned Part = 0; Part < UF; Part++) { 2320 // Collect the stored vector from each member. 2321 SmallVector<Value *, 4> StoredVecs; 2322 for (unsigned i = 0; i < InterleaveFactor; i++) { 2323 // Interleaved store group doesn't allow a gap, so each index has a member 2324 Instruction *Member = Group->getMember(i); 2325 assert(Member && "Fail to get a member from an interleaved store group"); 2326 2327 Value *StoredVec = getOrCreateVectorValue( 2328 cast<StoreInst>(Member)->getValueOperand(), Part); 2329 if (Group->isReverse()) 2330 StoredVec = reverseVector(StoredVec); 2331 2332 // If this member has different type, cast it to a unified type. 2333 2334 if (StoredVec->getType() != SubVT) 2335 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2336 2337 StoredVecs.push_back(StoredVec); 2338 } 2339 2340 // Concatenate all vectors into a wide vector. 2341 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2342 2343 // Interleave the elements in the wide vector. 
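    // For example (illustrative): with VF = 4 and a factor-2 group,
    // createInterleaveMask(4, 2) produces <0, 4, 1, 5, 2, 6, 3, 7>, i.e. lane
    // i of each member vector is placed next to lane i of the others,
    // analogous to the store-group example in the comment above this function.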
2344 Value *IVec = Builder.CreateShuffleVector( 2345 WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor), 2346 "interleaved.vec"); 2347 2348 Instruction *NewStoreInstr; 2349 if (BlockInMask) { 2350 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2351 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2352 Value *ShuffledMask = Builder.CreateShuffleVector( 2353 BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF), 2354 "interleaved.mask"); 2355 NewStoreInstr = Builder.CreateMaskedStore( 2356 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2357 } 2358 else 2359 NewStoreInstr = 2360 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2361 2362 Group->addMetadata(NewStoreInstr); 2363 } 2364 } 2365 2366 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2367 VPTransformState &State, 2368 VPValue *Addr, 2369 VPValue *StoredValue, 2370 VPValue *BlockInMask) { 2371 // Attempt to issue a wide load. 2372 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2373 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2374 2375 assert((LI || SI) && "Invalid Load/Store instruction"); 2376 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2377 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2378 2379 LoopVectorizationCostModel::InstWidening Decision = 2380 Cost->getWideningDecision(Instr, VF); 2381 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2382 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2383 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2384 "CM decision is not to widen the memory instruction"); 2385 2386 Type *ScalarDataTy = getMemInstValueType(Instr); 2387 Type *DataTy = VectorType::get(ScalarDataTy, VF); 2388 const Align Alignment = getLoadStoreAlignment(Instr); 2389 2390 // Determine if the pointer operand of the access is either consecutive or 2391 // reverse consecutive. 2392 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2393 bool ConsecutiveStride = 2394 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2395 bool CreateGatherScatter = 2396 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2397 2398 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2399 // gather/scatter. Otherwise Decision should have been to Scalarize. 2400 assert((ConsecutiveStride || CreateGatherScatter) && 2401 "The instruction should be scalarized"); 2402 (void)ConsecutiveStride; 2403 2404 VectorParts BlockInMaskParts(UF); 2405 bool isMaskRequired = BlockInMask; 2406 if (isMaskRequired) 2407 for (unsigned Part = 0; Part < UF; ++Part) 2408 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2409 2410 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2411 // Calculate the pointer for the specific unroll-part. 2412 GetElementPtrInst *PartPtr = nullptr; 2413 2414 bool InBounds = false; 2415 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2416 InBounds = gep->isInBounds(); 2417 2418 if (Reverse) { 2419 // If the address is consecutive but reversed, then the 2420 // wide store needs to start at the last vector element. 
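      // Illustrative example (assumed values): with VF = 4, the pointer for
      // Part 0 is offset by -Part * VF = 0 and then by 1 - VF = -3, so the
      // wide access covers elements -3..0 relative to the scalar pointer; the
      // loaded or stored vector itself is reversed separately.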
2421 PartPtr = cast<GetElementPtrInst>( 2422 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); 2423 PartPtr->setIsInBounds(InBounds); 2424 PartPtr = cast<GetElementPtrInst>( 2425 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); 2426 PartPtr->setIsInBounds(InBounds); 2427 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2428 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2429 } else { 2430 PartPtr = cast<GetElementPtrInst>( 2431 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); 2432 PartPtr->setIsInBounds(InBounds); 2433 } 2434 2435 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2436 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2437 }; 2438 2439 // Handle Stores: 2440 if (SI) { 2441 setDebugLocFromInst(Builder, SI); 2442 2443 for (unsigned Part = 0; Part < UF; ++Part) { 2444 Instruction *NewSI = nullptr; 2445 Value *StoredVal = State.get(StoredValue, Part); 2446 if (CreateGatherScatter) { 2447 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2448 Value *VectorGep = State.get(Addr, Part); 2449 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2450 MaskPart); 2451 } else { 2452 if (Reverse) { 2453 // If we store to reverse consecutive memory locations, then we need 2454 // to reverse the order of elements in the stored value. 2455 StoredVal = reverseVector(StoredVal); 2456 // We don't want to update the value in the map as it might be used in 2457 // another expression. So don't call resetVectorValue(StoredVal). 2458 } 2459 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2460 if (isMaskRequired) 2461 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2462 BlockInMaskParts[Part]); 2463 else 2464 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2465 } 2466 addMetadata(NewSI, SI); 2467 } 2468 return; 2469 } 2470 2471 // Handle loads. 2472 assert(LI && "Must have a load instruction"); 2473 setDebugLocFromInst(Builder, LI); 2474 for (unsigned Part = 0; Part < UF; ++Part) { 2475 Value *NewLI; 2476 if (CreateGatherScatter) { 2477 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2478 Value *VectorGep = State.get(Addr, Part); 2479 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2480 nullptr, "wide.masked.gather"); 2481 addMetadata(NewLI, LI); 2482 } else { 2483 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2484 if (isMaskRequired) 2485 NewLI = Builder.CreateMaskedLoad( 2486 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2487 "wide.masked.load"); 2488 else 2489 NewLI = 2490 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2491 2492 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2493 addMetadata(NewLI, LI); 2494 if (Reverse) 2495 NewLI = reverseVector(NewLI); 2496 } 2497 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2498 } 2499 } 2500 2501 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2502 const VPIteration &Instance, 2503 bool IfPredicateInstr, 2504 VPTransformState &State) { 2505 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2506 2507 setDebugLocFromInst(Builder, Instr); 2508 2509 // Does this instruction return a value ? 
2510 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2511 2512 Instruction *Cloned = Instr->clone(); 2513 if (!IsVoidRetTy) 2514 Cloned->setName(Instr->getName() + ".cloned"); 2515 2516 // Replace the operands of the cloned instructions with their scalar 2517 // equivalents in the new loop. 2518 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2519 auto *NewOp = State.get(User.getOperand(op), Instance); 2520 Cloned->setOperand(op, NewOp); 2521 } 2522 addNewMetadata(Cloned, Instr); 2523 2524 // Place the cloned scalar in the new loop. 2525 Builder.Insert(Cloned); 2526 2527 // Add the cloned scalar to the scalar map entry. 2528 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2529 2530 // If we just cloned a new assumption, add it the assumption cache. 2531 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2532 if (II->getIntrinsicID() == Intrinsic::assume) 2533 AC->registerAssumption(II); 2534 2535 // End if-block. 2536 if (IfPredicateInstr) 2537 PredicatedInstructions.push_back(Cloned); 2538 } 2539 2540 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2541 Value *End, Value *Step, 2542 Instruction *DL) { 2543 BasicBlock *Header = L->getHeader(); 2544 BasicBlock *Latch = L->getLoopLatch(); 2545 // As we're just creating this loop, it's possible no latch exists 2546 // yet. If so, use the header as this will be a single block loop. 2547 if (!Latch) 2548 Latch = Header; 2549 2550 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2551 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2552 setDebugLocFromInst(Builder, OldInst); 2553 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2554 2555 Builder.SetInsertPoint(Latch->getTerminator()); 2556 setDebugLocFromInst(Builder, OldInst); 2557 2558 // Create i+1 and fill the PHINode. 2559 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2560 Induction->addIncoming(Start, L->getLoopPreheader()); 2561 Induction->addIncoming(Next, Latch); 2562 // Create the compare. 2563 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2564 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2565 2566 // Now we have two terminators. Remove the old one from the block. 2567 Latch->getTerminator()->eraseFromParent(); 2568 2569 return Induction; 2570 } 2571 2572 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2573 if (TripCount) 2574 return TripCount; 2575 2576 assert(L && "Create Trip Count for null loop."); 2577 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2578 // Find the loop boundaries. 2579 ScalarEvolution *SE = PSE.getSE(); 2580 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2581 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2582 "Invalid loop count"); 2583 2584 Type *IdxTy = Legal->getWidestInductionType(); 2585 assert(IdxTy && "No type for induction"); 2586 2587 // The exit count might have the type of i64 while the phi is i32. This can 2588 // happen if we have an induction variable that is sign extended before the 2589 // compare. The only way that we get a backedge taken count is that the 2590 // induction variable was signed and as such will not overflow. In such a case 2591 // truncation is legal. 
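  // Worked example (illustrative): for a loop whose 32-bit induction variable
  // is sign-extended to i64 before the compare, the backedge-taken count may
  // be computed as an i64 value n-1. It is truncated back to the widest
  // induction type here, and the code below adds 1 to form the trip count n.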
2592 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2593 IdxTy->getPrimitiveSizeInBits()) 2594 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2595 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2596 2597 // Get the total trip count from the count by adding 1. 2598 const SCEV *ExitCount = SE->getAddExpr( 2599 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2600 2601 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2602 2603 // Expand the trip count and place the new instructions in the preheader. 2604 // Notice that the pre-header does not change, only the loop body. 2605 SCEVExpander Exp(*SE, DL, "induction"); 2606 2607 // Count holds the overall loop count (N). 2608 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2609 L->getLoopPreheader()->getTerminator()); 2610 2611 if (TripCount->getType()->isPointerTy()) 2612 TripCount = 2613 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2614 L->getLoopPreheader()->getTerminator()); 2615 2616 return TripCount; 2617 } 2618 2619 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2620 if (VectorTripCount) 2621 return VectorTripCount; 2622 2623 Value *TC = getOrCreateTripCount(L); 2624 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2625 2626 Type *Ty = TC->getType(); 2627 Constant *Step = ConstantInt::get(Ty, VF * UF); 2628 2629 // If the tail is to be folded by masking, round the number of iterations N 2630 // up to a multiple of Step instead of rounding down. This is done by first 2631 // adding Step-1 and then rounding down. Note that it's ok if this addition 2632 // overflows: the vector induction variable will eventually wrap to zero given 2633 // that it starts at zero and its Step is a power of two; the loop will then 2634 // exit, with the last early-exit vector comparison also producing all-true. 2635 if (Cost->foldTailByMasking()) { 2636 assert(isPowerOf2_32(VF * UF) && 2637 "VF*UF must be a power of 2 when folding tail by masking"); 2638 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); 2639 } 2640 2641 // Now we need to generate the expression for the part of the loop that the 2642 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2643 // iterations are not required for correctness, or N - Step, otherwise. Step 2644 // is equal to the vectorization factor (number of SIMD elements) times the 2645 // unroll factor (number of SIMD instructions). 2646 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2647 2648 // If there is a non-reversed interleaved group that may speculatively access 2649 // memory out-of-bounds, we need to ensure that there will be at least one 2650 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2651 // the trip count, we set the remainder to be equal to the step. If the step 2652 // does not evenly divide the trip count, no adjustment is necessary since 2653 // there will already be scalar iterations. Note that the minimum iterations 2654 // check ensures that N >= Step. 
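  // Worked example (illustrative, assuming the tail is not folded by
  // masking): with VF * UF = 8 and a trip count of 16, N % Step is 0, so when
  // a scalar epilogue is required the remainder is bumped to 8; n.vec becomes
  // 16 - 8 = 8 iterations for the vector loop and 8 iterations remain for the
  // scalar epilogue. With a trip count of 19 the remainder is already 3 and
  // no adjustment is needed.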
2655 if (VF > 1 && Cost->requiresScalarEpilogue()) { 2656 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2657 R = Builder.CreateSelect(IsZero, Step, R); 2658 } 2659 2660 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2661 2662 return VectorTripCount; 2663 } 2664 2665 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2666 const DataLayout &DL) { 2667 // Verify that V is a vector type with same number of elements as DstVTy. 2668 unsigned VF = DstVTy->getNumElements(); 2669 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2670 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2671 Type *SrcElemTy = SrcVecTy->getElementType(); 2672 Type *DstElemTy = DstVTy->getElementType(); 2673 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2674 "Vector elements must have same size"); 2675 2676 // Do a direct cast if element types are castable. 2677 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2678 return Builder.CreateBitOrPointerCast(V, DstVTy); 2679 } 2680 // V cannot be directly casted to desired vector type. 2681 // May happen when V is a floating point vector but DstVTy is a vector of 2682 // pointers or vice-versa. Handle this using a two-step bitcast using an 2683 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2684 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2685 "Only one type should be a pointer type"); 2686 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2687 "Only one type should be a floating point type"); 2688 Type *IntTy = 2689 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2690 VectorType *VecIntTy = VectorType::get(IntTy, VF); 2691 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2692 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2693 } 2694 2695 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2696 BasicBlock *Bypass) { 2697 Value *Count = getOrCreateTripCount(L); 2698 // Reuse existing vector loop preheader for TC checks. 2699 // Note that new preheader block is generated for vector loop. 2700 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2701 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2702 2703 // Generate code to check if the loop's trip count is less than VF * UF, or 2704 // equal to it in case a scalar epilogue is required; this implies that the 2705 // vector trip count is zero. This check also covers the case where adding one 2706 // to the backedge-taken count overflowed leading to an incorrect trip count 2707 // of zero. In this case we will also jump to the scalar loop. 2708 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 2709 : ICmpInst::ICMP_ULT; 2710 2711 // If tail is to be folded, vector loop takes care of all iterations. 2712 Value *CheckMinIters = Builder.getFalse(); 2713 if (!Cost->foldTailByMasking()) 2714 CheckMinIters = Builder.CreateICmp( 2715 P, Count, ConstantInt::get(Count->getType(), VF * UF), 2716 "min.iters.check"); 2717 2718 // Create new preheader for vector loop. 2719 LoopVectorPreHeader = 2720 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2721 "vector.ph"); 2722 2723 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2724 DT->getNode(Bypass)->getIDom()) && 2725 "TC check is expected to dominate Bypass"); 2726 2727 // Update dominator for Bypass & LoopExit. 
2728 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2729 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2730 2731 ReplaceInstWithInst( 2732 TCCheckBlock->getTerminator(), 2733 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2734 LoopBypassBlocks.push_back(TCCheckBlock); 2735 } 2736 2737 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2738 // Reuse existing vector loop preheader for SCEV checks. 2739 // Note that new preheader block is generated for vector loop. 2740 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 2741 2742 // Generate the code to check that the SCEV assumptions that we made. 2743 // We want the new basic block to start at the first instruction in a 2744 // sequence of instructions that form a check. 2745 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2746 "scev.check"); 2747 Value *SCEVCheck = Exp.expandCodeForPredicate( 2748 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 2749 2750 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2751 if (C->isZero()) 2752 return; 2753 2754 assert(!SCEVCheckBlock->getParent()->hasOptSize() && 2755 "Cannot SCEV check stride or overflow when optimizing for size"); 2756 2757 SCEVCheckBlock->setName("vector.scevcheck"); 2758 // Create new preheader for vector loop. 2759 LoopVectorPreHeader = 2760 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 2761 nullptr, "vector.ph"); 2762 2763 // Update dominator only if this is first RT check. 2764 if (LoopBypassBlocks.empty()) { 2765 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2766 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2767 } 2768 2769 ReplaceInstWithInst( 2770 SCEVCheckBlock->getTerminator(), 2771 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 2772 LoopBypassBlocks.push_back(SCEVCheckBlock); 2773 AddedSafetyChecks = true; 2774 } 2775 2776 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2777 // VPlan-native path does not do any analysis for runtime checks currently. 2778 if (EnableVPlanNativePath) 2779 return; 2780 2781 // Reuse existing vector loop preheader for runtime memory checks. 2782 // Note that new preheader block is generated for vector loop. 2783 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 2784 2785 // Generate the code that checks in runtime if arrays overlap. We put the 2786 // checks into a separate block to make the more common case of few elements 2787 // faster. 
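// For example, a loop such as (hypothetical source, for illustration only)
//
//   void foo(int *A, int *B, int N) {
//     for (int i = 0; i < N; ++i)
//       A[i] = B[i] + 1;
//   }
//
// can only run as a vector loop if the ranges [A, A+N) and [B, B+N) do not
// overlap. Since that is unknown at compile time, the "vector.memcheck"
// block built below compares the two ranges at runtime and branches to the
// scalar loop (the Bypass block) when they may alias.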
2788 auto *LAI = Legal->getLAI(); 2789 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 2790 if (!RtPtrChecking.Need) 2791 return; 2792 Instruction *FirstCheckInst; 2793 Instruction *MemRuntimeCheck; 2794 std::tie(FirstCheckInst, MemRuntimeCheck) = 2795 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 2796 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 2797 if (!MemRuntimeCheck) 2798 return; 2799 2800 if (MemCheckBlock->getParent()->hasOptSize()) { 2801 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2802 "Cannot emit memory checks when optimizing for size, unless forced " 2803 "to vectorize."); 2804 ORE->emit([&]() { 2805 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2806 L->getStartLoc(), L->getHeader()) 2807 << "Code-size may be reduced by not forcing " 2808 "vectorization, or by source-code modifications " 2809 "eliminating the need for runtime checks " 2810 "(e.g., adding 'restrict')."; 2811 }); 2812 } 2813 2814 MemCheckBlock->setName("vector.memcheck"); 2815 // Create new preheader for vector loop. 2816 LoopVectorPreHeader = 2817 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2818 "vector.ph"); 2819 2820 // Update dominator only if this is first RT check. 2821 if (LoopBypassBlocks.empty()) { 2822 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2823 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2824 } 2825 2826 ReplaceInstWithInst( 2827 MemCheckBlock->getTerminator(), 2828 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); 2829 LoopBypassBlocks.push_back(MemCheckBlock); 2830 AddedSafetyChecks = true; 2831 2832 // We currently don't use LoopVersioning for the actual loop cloning but we 2833 // still use it to add the noalias metadata. 2834 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2835 PSE.getSE()); 2836 LVer->prepareNoAliasMetadata(); 2837 } 2838 2839 Value *InnerLoopVectorizer::emitTransformedIndex( 2840 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2841 const InductionDescriptor &ID) const { 2842 2843 SCEVExpander Exp(*SE, DL, "induction"); 2844 auto Step = ID.getStep(); 2845 auto StartValue = ID.getStartValue(); 2846 assert(Index->getType() == Step->getType() && 2847 "Index type does not match StepValue type"); 2848 2849 // Note: the IR at this point is broken. We cannot use SE to create any new 2850 // SCEV and then expand it, hoping that SCEV's simplification will give us 2851 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2852 // lead to various SCEV crashes. So all we can do is to use builder and rely 2853 // on InstCombine for future simplifications. Here we handle some trivial 2854 // cases only. 
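// For example, for an integer induction with StartValue = 10 and Step = 3,
// the code below computes the transformed index Start + Index * Step, so
// Index = 4 yields 10 + 3 * 4 = 22. The CreateAdd/CreateMul helpers below
// only fold the trivial x + 0 and x * 1 cases, since SCEV cannot be used on
// the broken IR at this point.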
2855 auto CreateAdd = [&B](Value *X, Value *Y) { 2856 assert(X->getType() == Y->getType() && "Types don't match!"); 2857 if (auto *CX = dyn_cast<ConstantInt>(X)) 2858 if (CX->isZero()) 2859 return Y; 2860 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2861 if (CY->isZero()) 2862 return X; 2863 return B.CreateAdd(X, Y); 2864 }; 2865 2866 auto CreateMul = [&B](Value *X, Value *Y) { 2867 assert(X->getType() == Y->getType() && "Types don't match!"); 2868 if (auto *CX = dyn_cast<ConstantInt>(X)) 2869 if (CX->isOne()) 2870 return Y; 2871 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2872 if (CY->isOne()) 2873 return X; 2874 return B.CreateMul(X, Y); 2875 }; 2876 2877 switch (ID.getKind()) { 2878 case InductionDescriptor::IK_IntInduction: { 2879 assert(Index->getType() == StartValue->getType() && 2880 "Index type does not match StartValue type"); 2881 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 2882 return B.CreateSub(StartValue, Index); 2883 auto *Offset = CreateMul( 2884 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); 2885 return CreateAdd(StartValue, Offset); 2886 } 2887 case InductionDescriptor::IK_PtrInduction: { 2888 assert(isa<SCEVConstant>(Step) && 2889 "Expected constant step for pointer induction"); 2890 return B.CreateGEP( 2891 StartValue->getType()->getPointerElementType(), StartValue, 2892 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), 2893 &*B.GetInsertPoint()))); 2894 } 2895 case InductionDescriptor::IK_FpInduction: { 2896 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2897 auto InductionBinOp = ID.getInductionBinOp(); 2898 assert(InductionBinOp && 2899 (InductionBinOp->getOpcode() == Instruction::FAdd || 2900 InductionBinOp->getOpcode() == Instruction::FSub) && 2901 "Original bin op should be defined for FP induction"); 2902 2903 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 2904 2905 // Floating point operations had to be 'fast' to enable the induction. 2906 FastMathFlags Flags; 2907 Flags.setFast(); 2908 2909 Value *MulExp = B.CreateFMul(StepValue, Index); 2910 if (isa<Instruction>(MulExp)) 2911 // We have to check, the MulExp may be a constant. 2912 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 2913 2914 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2915 "induction"); 2916 if (isa<Instruction>(BOp)) 2917 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2918 2919 return BOp; 2920 } 2921 case InductionDescriptor::IK_NoInduction: 2922 return nullptr; 2923 } 2924 llvm_unreachable("invalid enum"); 2925 } 2926 2927 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 2928 /* 2929 In this function we generate a new loop. The new loop will contain 2930 the vectorized instructions while the old loop will continue to run the 2931 scalar remainder. 2932 2933 [ ] <-- loop iteration number check. 2934 / | 2935 / v 2936 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2937 | / | 2938 | / v 2939 || [ ] <-- vector pre header. 2940 |/ | 2941 | v 2942 | [ ] \ 2943 | [ ]_| <-- vector loop. 2944 | | 2945 | v 2946 | -[ ] <--- middle-block. 2947 | / | 2948 | / v 2949 -|- >[ ] <--- new preheader. 2950 | | 2951 | v 2952 | [ ] \ 2953 | [ ]_| <-- old scalar loop to handle remainder. 2954 \ | 2955 \ v 2956 >[ ] <-- exit block. 2957 ... 2958 */ 2959 2960 MDNode *OrigLoopID = OrigLoop->getLoopID(); 2961 2962 // Some loops have a single integer induction variable, while other loops 2963 // don't. 
One example is c++ iterators that often have multiple pointer 2964 // induction variables. In the code below we also support a case where we 2965 // don't have a single induction variable. 2966 // 2967 // We try to obtain an induction variable from the original loop as hard 2968 // as possible. However if we don't find one that: 2969 // - is an integer 2970 // - counts from zero, stepping by one 2971 // - is the size of the widest induction variable type 2972 // then we create a new one. 2973 OldInduction = Legal->getPrimaryInduction(); 2974 Type *IdxTy = Legal->getWidestInductionType(); 2975 2976 // Split the single block loop into the two loop structure described above. 2977 LoopScalarBody = OrigLoop->getHeader(); 2978 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 2979 LoopExitBlock = OrigLoop->getExitBlock(); 2980 assert(LoopExitBlock && "Must have an exit block"); 2981 assert(LoopVectorPreHeader && "Invalid loop structure"); 2982 2983 LoopMiddleBlock = 2984 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2985 LI, nullptr, "middle.block"); 2986 LoopScalarPreHeader = 2987 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 2988 nullptr, "scalar.ph"); 2989 // We intentionally don't let SplitBlock to update LoopInfo since 2990 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 2991 // LoopVectorBody is explicitly added to the correct place few lines later. 2992 LoopVectorBody = 2993 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2994 nullptr, nullptr, "vector.body"); 2995 2996 // Update dominator for loop exit. 2997 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 2998 2999 // Create and register the new vector loop. 3000 Loop *Lp = LI->AllocateLoop(); 3001 Loop *ParentLoop = OrigLoop->getParentLoop(); 3002 3003 // Insert the new loop into the loop nest and register the new basic blocks 3004 // before calling any utilities such as SCEV that require valid LoopInfo. 3005 if (ParentLoop) { 3006 ParentLoop->addChildLoop(Lp); 3007 } else { 3008 LI->addTopLevelLoop(Lp); 3009 } 3010 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3011 3012 // Find the loop boundaries. 3013 Value *Count = getOrCreateTripCount(Lp); 3014 3015 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3016 3017 // Now, compare the new count to zero. If it is zero skip the vector loop and 3018 // jump to the scalar loop. This check also covers the case where the 3019 // backedge-taken count is uint##_max: adding one to it will overflow leading 3020 // to an incorrect trip count of zero. In this (rare) case we will also jump 3021 // to the scalar loop. 3022 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3023 3024 // Generate the code to check any assumptions that we've made for SCEV 3025 // expressions. 3026 emitSCEVChecks(Lp, LoopScalarPreHeader); 3027 3028 // Generate the code that checks in runtime if arrays overlap. We put the 3029 // checks into a separate block to make the more common case of few elements 3030 // faster. 3031 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3032 3033 // Generate the induction variable. 3034 // The loop step is equal to the vectorization factor (num of SIMD elements) 3035 // times the unroll factor (num of SIMD instructions). 
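// For instance, with VF = 4 and UF = 2 the canonical induction variable
// created below starts at 0 and advances by 8 on every vector iteration,
// exiting the vector loop once it reaches the vector trip count
// (CountRoundDown).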
3036 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3037 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 3038 Induction = 3039 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3040 getDebugLocFromInstOrOperands(OldInduction)); 3041 3042 // We are going to resume the execution of the scalar loop. 3043 // Go over all of the induction variables that we found and fix the 3044 // PHIs that are left in the scalar version of the loop. 3045 // The starting values of PHI nodes depend on the counter of the last 3046 // iteration in the vectorized loop. 3047 // If we come from a bypass edge then we need to start from the original 3048 // start value. 3049 3050 // This variable saves the new starting index for the scalar loop. It is used 3051 // to test if there are any tail iterations left once the vector loop has 3052 // completed. 3053 for (auto &InductionEntry : Legal->getInductionVars()) { 3054 PHINode *OrigPhi = InductionEntry.first; 3055 InductionDescriptor II = InductionEntry.second; 3056 3057 // Create phi nodes to merge from the backedge-taken check block. 3058 PHINode *BCResumeVal = 3059 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3060 LoopScalarPreHeader->getTerminator()); 3061 // Copy original phi DL over to the new one. 3062 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3063 Value *&EndValue = IVEndValues[OrigPhi]; 3064 if (OrigPhi == OldInduction) { 3065 // We know what the end value is. 3066 EndValue = CountRoundDown; 3067 } else { 3068 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); 3069 Type *StepType = II.getStep()->getType(); 3070 Instruction::CastOps CastOp = 3071 CastInst::getCastOpcode(CountRoundDown, true, StepType, true); 3072 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); 3073 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3074 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3075 EndValue->setName("ind.end"); 3076 } 3077 3078 // The new PHI merges the original incoming value, in case of a bypass, 3079 // or the value at the end of the vectorized loop. 3080 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3081 3082 // Fix the scalar body counter (PHI node). 3083 // The old induction's phi node in the scalar body needs the truncated 3084 // value. 3085 for (BasicBlock *BB : LoopBypassBlocks) 3086 BCResumeVal->addIncoming(II.getStartValue(), BB); 3087 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3088 } 3089 3090 // We need the OrigLoop (scalar loop part) latch terminator to help 3091 // produce correct debug info for the middle block BB instructions. 3092 // The legality check stage guarantees that the loop will have a single 3093 // latch. 3094 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3095 "Scalar loop latch terminator isn't a branch"); 3096 BranchInst *ScalarLatchBr = 3097 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3098 3099 // Add a check in the middle block to see if we have completed 3100 // all of the iterations in the first vector loop. 3101 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3102 // If tail is to be folded, we know we don't need to run the remainder. 
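// For example, if N = 16 and VF * UF = 8 (and no scalar epilogue is
// required), CountRoundDown is also 16, the comparison below is true and the
// middle block branches straight to the exit block; with N = 20,
// CountRoundDown is 16, the comparison is false and the branch goes to the
// scalar preheader so the remaining 4 iterations run in the scalar loop.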
3103 Value *CmpN = Builder.getTrue(); 3104 if (!Cost->foldTailByMasking()) { 3105 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3106 CountRoundDown, "cmp.n", 3107 LoopMiddleBlock->getTerminator()); 3108 3109 // Here we use the same DebugLoc as the scalar loop latch branch instead 3110 // of the corresponding compare because they may have ended up with 3111 // different line numbers and we want to avoid awkward line stepping while 3112 // debugging. Eg. if the compare has got a line number inside the loop. 3113 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3114 } 3115 3116 BranchInst *BrInst = 3117 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3118 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3119 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3120 3121 // Get ready to start creating new instructions into the vectorized body. 3122 assert(LoopVectorPreHeader == Lp->getLoopPreheader() && 3123 "Inconsistent vector loop preheader"); 3124 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3125 3126 Optional<MDNode *> VectorizedLoopID = 3127 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3128 LLVMLoopVectorizeFollowupVectorized}); 3129 if (VectorizedLoopID.hasValue()) { 3130 Lp->setLoopID(VectorizedLoopID.getValue()); 3131 3132 // Do not setAlreadyVectorized if loop attributes have been defined 3133 // explicitly. 3134 return LoopVectorPreHeader; 3135 } 3136 3137 // Keep all loop hints from the original loop on the vector loop (we'll 3138 // replace the vectorizer-specific hints below). 3139 if (MDNode *LID = OrigLoop->getLoopID()) 3140 Lp->setLoopID(LID); 3141 3142 LoopVectorizeHints Hints(Lp, true, *ORE); 3143 Hints.setAlreadyVectorized(); 3144 3145 #ifdef EXPENSIVE_CHECKS 3146 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3147 LI->verify(*DT); 3148 #endif 3149 3150 return LoopVectorPreHeader; 3151 } 3152 3153 // Fix up external users of the induction variable. At this point, we are 3154 // in LCSSA form, with all external PHIs that use the IV having one input value, 3155 // coming from the remainder loop. We need those PHIs to also have a correct 3156 // value for the IV when arriving directly from the middle block. 3157 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3158 const InductionDescriptor &II, 3159 Value *CountRoundDown, Value *EndValue, 3160 BasicBlock *MiddleBlock) { 3161 // There are two kinds of external IV usages - those that use the value 3162 // computed in the last iteration (the PHI) and those that use the penultimate 3163 // value (the value that feeds into the phi from the loop latch). 3164 // We allow both, but they, obviously, have different values. 3165 3166 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3167 3168 DenseMap<Value *, Value *> MissingVals; 3169 3170 // An external user of the last iteration's value should see the value that 3171 // the remainder loop uses to initialize its own IV. 3172 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3173 for (User *U : PostInc->users()) { 3174 Instruction *UI = cast<Instruction>(U); 3175 if (!OrigLoop->contains(UI)) { 3176 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3177 MissingVals[UI] = EndValue; 3178 } 3179 } 3180 3181 // An external user of the penultimate value need to see EndValue - Step. 3182 // The simplest way to get this is to recompute it from the constituent SCEVs, 3183 // that is Start + (Step * (CRD - 1)). 
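// For example, for an induction with Start = 0 and Step = 2 and a vector
// trip count CRD = 16, a user of the post-increment value sees
// EndValue = 0 + 2 * 16 = 32, while a user of the phi itself must see the
// penultimate value 0 + 2 * (16 - 1) = 30 when arriving from the middle
// block.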
3184 for (User *U : OrigPhi->users()) { 3185 auto *UI = cast<Instruction>(U); 3186 if (!OrigLoop->contains(UI)) { 3187 const DataLayout &DL = 3188 OrigLoop->getHeader()->getModule()->getDataLayout(); 3189 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3190 3191 IRBuilder<> B(MiddleBlock->getTerminator()); 3192 Value *CountMinusOne = B.CreateSub( 3193 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3194 Value *CMO = 3195 !II.getStep()->getType()->isIntegerTy() 3196 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3197 II.getStep()->getType()) 3198 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3199 CMO->setName("cast.cmo"); 3200 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3201 Escape->setName("ind.escape"); 3202 MissingVals[UI] = Escape; 3203 } 3204 } 3205 3206 for (auto &I : MissingVals) { 3207 PHINode *PHI = cast<PHINode>(I.first); 3208 // One corner case we have to handle is two IVs "chasing" each-other, 3209 // that is %IV2 = phi [...], [ %IV1, %latch ] 3210 // In this case, if IV1 has an external use, we need to avoid adding both 3211 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3212 // don't already have an incoming value for the middle block. 3213 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3214 PHI->addIncoming(I.second, MiddleBlock); 3215 } 3216 } 3217 3218 namespace { 3219 3220 struct CSEDenseMapInfo { 3221 static bool canHandle(const Instruction *I) { 3222 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3223 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3224 } 3225 3226 static inline Instruction *getEmptyKey() { 3227 return DenseMapInfo<Instruction *>::getEmptyKey(); 3228 } 3229 3230 static inline Instruction *getTombstoneKey() { 3231 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3232 } 3233 3234 static unsigned getHashValue(const Instruction *I) { 3235 assert(canHandle(I) && "Unknown instruction!"); 3236 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3237 I->value_op_end())); 3238 } 3239 3240 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3241 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3242 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3243 return LHS == RHS; 3244 return LHS->isIdenticalTo(RHS); 3245 } 3246 }; 3247 3248 } // end anonymous namespace 3249 3250 ///Perform cse of induction variable instructions. 3251 static void cse(BasicBlock *BB) { 3252 // Perform simple cse. 3253 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3254 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3255 Instruction *In = &*I++; 3256 3257 if (!CSEDenseMapInfo::canHandle(In)) 3258 continue; 3259 3260 // Check if we can replace this instruction with any of the 3261 // visited instructions. 3262 if (Instruction *V = CSEMap.lookup(In)) { 3263 In->replaceAllUsesWith(V); 3264 In->eraseFromParent(); 3265 continue; 3266 } 3267 3268 CSEMap[In] = In; 3269 } 3270 } 3271 3272 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3273 unsigned VF, 3274 bool &NeedToScalarize) { 3275 Function *F = CI->getCalledFunction(); 3276 Type *ScalarRetTy = CI->getType(); 3277 SmallVector<Type *, 4> Tys, ScalarTys; 3278 for (auto &ArgOp : CI->arg_operands()) 3279 ScalarTys.push_back(ArgOp->getType()); 3280 3281 // Estimate cost of scalarized vector call. 
The source operands are assumed 3282 // to be vectors, so we need to extract individual elements from there, 3283 // execute VF scalar calls, and then gather the result into the vector return 3284 // value. 3285 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, 3286 TTI::TCK_RecipThroughput); 3287 if (VF == 1) 3288 return ScalarCallCost; 3289 3290 // Compute corresponding vector type for return value and arguments. 3291 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3292 for (Type *ScalarTy : ScalarTys) 3293 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3294 3295 // Compute costs of unpacking argument values for the scalar calls and 3296 // packing the return values to a vector. 3297 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3298 3299 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3300 3301 // If we can't emit a vector call for this function, then the currently found 3302 // cost is the cost we need to return. 3303 NeedToScalarize = true; 3304 VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/); 3305 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3306 3307 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3308 return Cost; 3309 3310 // If the corresponding vector cost is cheaper, return its cost. 3311 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, 3312 TTI::TCK_RecipThroughput); 3313 if (VectorCallCost < Cost) { 3314 NeedToScalarize = false; 3315 return VectorCallCost; 3316 } 3317 return Cost; 3318 } 3319 3320 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3321 unsigned VF) { 3322 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3323 assert(ID && "Expected intrinsic call!"); 3324 3325 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3326 return TTI.getIntrinsicInstrCost(CostAttrs, 3327 TargetTransformInfo::TCK_RecipThroughput); 3328 } 3329 3330 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3331 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3332 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3333 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3334 } 3335 3336 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3337 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3338 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3339 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3340 } 3341 3342 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3343 // For every instruction `I` in MinBWs, truncate the operands, create a 3344 // truncated version of `I` and reextend its result. InstCombine runs 3345 // later and will remove any ext/trunc pairs. 3346 SmallPtrSet<Value *, 4> Erased; 3347 for (const auto &KV : Cost->getMinimalBitwidths()) { 3348 // If the value wasn't vectorized, we must maintain the original scalar 3349 // type. The absence of the value from VectorLoopValueMap indicates that it 3350 // wasn't vectorized. 
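// As an example of the rewrite performed below: if the cost model determined
// that an i32 add only needs 8 bits, the <4 x i32> add is rewritten as a
// trunc of its operands to <4 x i8>, an add in <4 x i8>, and a zext of the
// result back to <4 x i32>; InstCombine is expected to clean up the
// redundant ext/trunc pairs afterwards.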
3351 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3352 continue; 3353 for (unsigned Part = 0; Part < UF; ++Part) { 3354 Value *I = getOrCreateVectorValue(KV.first, Part); 3355 if (Erased.find(I) != Erased.end() || I->use_empty() || 3356 !isa<Instruction>(I)) 3357 continue; 3358 Type *OriginalTy = I->getType(); 3359 Type *ScalarTruncatedTy = 3360 IntegerType::get(OriginalTy->getContext(), KV.second); 3361 Type *TruncatedTy = VectorType::get( 3362 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements()); 3363 if (TruncatedTy == OriginalTy) 3364 continue; 3365 3366 IRBuilder<> B(cast<Instruction>(I)); 3367 auto ShrinkOperand = [&](Value *V) -> Value * { 3368 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3369 if (ZI->getSrcTy() == TruncatedTy) 3370 return ZI->getOperand(0); 3371 return B.CreateZExtOrTrunc(V, TruncatedTy); 3372 }; 3373 3374 // The actual instruction modification depends on the instruction type, 3375 // unfortunately. 3376 Value *NewI = nullptr; 3377 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3378 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3379 ShrinkOperand(BO->getOperand(1))); 3380 3381 // Any wrapping introduced by shrinking this operation shouldn't be 3382 // considered undefined behavior. So, we can't unconditionally copy 3383 // arithmetic wrapping flags to NewI. 3384 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3385 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3386 NewI = 3387 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3388 ShrinkOperand(CI->getOperand(1))); 3389 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3390 NewI = B.CreateSelect(SI->getCondition(), 3391 ShrinkOperand(SI->getTrueValue()), 3392 ShrinkOperand(SI->getFalseValue())); 3393 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3394 switch (CI->getOpcode()) { 3395 default: 3396 llvm_unreachable("Unhandled cast!"); 3397 case Instruction::Trunc: 3398 NewI = ShrinkOperand(CI->getOperand(0)); 3399 break; 3400 case Instruction::SExt: 3401 NewI = B.CreateSExtOrTrunc( 3402 CI->getOperand(0), 3403 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3404 break; 3405 case Instruction::ZExt: 3406 NewI = B.CreateZExtOrTrunc( 3407 CI->getOperand(0), 3408 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3409 break; 3410 } 3411 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3412 auto Elements0 = 3413 cast<VectorType>(SI->getOperand(0)->getType())->getNumElements(); 3414 auto *O0 = B.CreateZExtOrTrunc( 3415 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3416 auto Elements1 = 3417 cast<VectorType>(SI->getOperand(1)->getType())->getNumElements(); 3418 auto *O1 = B.CreateZExtOrTrunc( 3419 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3420 3421 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3422 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3423 // Don't do anything with the operands, just extend the result. 
3424 continue; 3425 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3426 auto Elements = 3427 cast<VectorType>(IE->getOperand(0)->getType())->getNumElements(); 3428 auto *O0 = B.CreateZExtOrTrunc( 3429 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3430 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3431 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3432 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3433 auto Elements = 3434 cast<VectorType>(EE->getOperand(0)->getType())->getNumElements(); 3435 auto *O0 = B.CreateZExtOrTrunc( 3436 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3437 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3438 } else { 3439 // If we don't know what to do, be conservative and don't do anything. 3440 continue; 3441 } 3442 3443 // Lastly, extend the result. 3444 NewI->takeName(cast<Instruction>(I)); 3445 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3446 I->replaceAllUsesWith(Res); 3447 cast<Instruction>(I)->eraseFromParent(); 3448 Erased.insert(I); 3449 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3450 } 3451 } 3452 3453 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3454 for (const auto &KV : Cost->getMinimalBitwidths()) { 3455 // If the value wasn't vectorized, we must maintain the original scalar 3456 // type. The absence of the value from VectorLoopValueMap indicates that it 3457 // wasn't vectorized. 3458 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3459 continue; 3460 for (unsigned Part = 0; Part < UF; ++Part) { 3461 Value *I = getOrCreateVectorValue(KV.first, Part); 3462 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3463 if (Inst && Inst->use_empty()) { 3464 Value *NewI = Inst->getOperand(0); 3465 Inst->eraseFromParent(); 3466 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3467 } 3468 } 3469 } 3470 } 3471 3472 void InnerLoopVectorizer::fixVectorizedLoop() { 3473 // Insert truncates and extends for any truncated instructions as hints to 3474 // InstCombine. 3475 if (VF > 1) 3476 truncateToMinimalBitwidths(); 3477 3478 // Fix widened non-induction PHIs by setting up the PHI operands. 3479 if (OrigPHIsToFix.size()) { 3480 assert(EnableVPlanNativePath && 3481 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3482 fixNonInductionPHIs(); 3483 } 3484 3485 // At this point every instruction in the original loop is widened to a 3486 // vector form. Now we need to fix the recurrences in the loop. These PHI 3487 // nodes are currently empty because we did not want to introduce cycles. 3488 // This is the second stage of vectorizing recurrences. 3489 fixCrossIterationPHIs(); 3490 3491 // Forget the original basic block. 3492 PSE.getSE()->forgetLoop(OrigLoop); 3493 3494 // Fix-up external users of the induction variables. 3495 for (auto &Entry : Legal->getInductionVars()) 3496 fixupIVUsers(Entry.first, Entry.second, 3497 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3498 IVEndValues[Entry.first], LoopMiddleBlock); 3499 3500 fixLCSSAPHIs(); 3501 for (Instruction *PI : PredicatedInstructions) 3502 sinkScalarOperands(&*PI); 3503 3504 // Remove redundant induction instructions. 3505 cse(LoopVectorBody); 3506 3507 // Set/update profile weights for the vector and remainder loops as original 3508 // loop iterations are now distributed among them. Note that original loop 3509 // represented by LoopScalarBody becomes remainder loop after vectorization. 
3510 //
3511 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3512 // end up with a slightly less accurate result, but that should be OK since
3513 // the profile is not inherently precise anyway. Note also that a possible
3514 // bypass of the vector code caused by legality checks is ignored,
3515 // optimistically assigning all the weight to the vector loop.
3516 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3517 LI->getLoopFor(LoopVectorBody),
3518 LI->getLoopFor(LoopScalarBody), VF * UF);
3519 }
3520
3521 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3522 // In order to support recurrences we need to be able to vectorize Phi nodes.
3523 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3524 // stage #2: We now need to fix the recurrences by adding incoming edges to
3525 // the currently empty PHI nodes. At this point every instruction in the
3526 // original loop is widened to a vector form so we can use them to construct
3527 // the incoming edges.
3528 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3529 // Handle first-order recurrences and reductions that need to be fixed.
3530 if (Legal->isFirstOrderRecurrence(&Phi))
3531 fixFirstOrderRecurrence(&Phi);
3532 else if (Legal->isReductionVariable(&Phi))
3533 fixReduction(&Phi);
3534 }
3535 }
3536
3537 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3538 // This is the second phase of vectorizing first-order recurrences. An
3539 // overview of the transformation is described below. Suppose we have the
3540 // following loop.
3541 //
3542 // for (int i = 0; i < n; ++i)
3543 // b[i] = a[i] - a[i - 1];
3544 //
3545 // There is a first-order recurrence on "a". For this loop, the shorthand
3546 // scalar IR looks like:
3547 //
3548 // scalar.ph:
3549 // s_init = a[-1]
3550 // br scalar.body
3551 //
3552 // scalar.body:
3553 // i = phi [0, scalar.ph], [i+1, scalar.body]
3554 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3555 // s2 = a[i]
3556 // b[i] = s2 - s1
3557 // br cond, scalar.body, ...
3558 //
3559 // In this example, s1 is a recurrence because its value depends on the
3560 // previous iteration. In the first phase of vectorization, we created a
3561 // temporary value for s1. We now complete the vectorization and produce the
3562 // shorthand vector IR shown below (for VF = 4, UF = 1).
3563 //
3564 // vector.ph:
3565 // v_init = vector(..., ..., ..., a[-1])
3566 // br vector.body
3567 //
3568 // vector.body
3569 // i = phi [0, vector.ph], [i+4, vector.body]
3570 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3571 // v2 = a[i, i+1, i+2, i+3];
3572 // v3 = vector(v1(3), v2(0, 1, 2))
3573 // b[i, i+1, i+2, i+3] = v2 - v3
3574 // br cond, vector.body, middle.block
3575 //
3576 // middle.block:
3577 // x = v2(3)
3578 // br scalar.ph
3579 //
3580 // scalar.ph:
3581 // s_init = phi [x, middle.block], [a[-1], otherwise]
3582 // br scalar.body
3583 //
3584 // After the vector loop finishes executing, we extract the next value of
3585 // the recurrence (x) to use as the initial value in the scalar loop.
3586
3587 // Get the original loop preheader and single loop latch.
3588 auto *Preheader = OrigLoop->getLoopPreheader();
3589 auto *Latch = OrigLoop->getLoopLatch();
3590
3591 // Get the initial and previous values of the scalar recurrence.
3592 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3593 auto *Previous = Phi->getIncomingValueForBlock(Latch);
3594
3595 // Create a vector from the initial value.
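// For example, with VF = 4 and a scalar initial value s_init, VectorInit
// below becomes the vector <undef, undef, undef, s_init>, i.e. s_init is
// placed in the last lane, matching the v_init shown in the sketch above.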
3596 auto *VectorInit = ScalarInit; 3597 if (VF > 1) { 3598 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3599 VectorInit = Builder.CreateInsertElement( 3600 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3601 Builder.getInt32(VF - 1), "vector.recur.init"); 3602 } 3603 3604 // We constructed a temporary phi node in the first phase of vectorization. 3605 // This phi node will eventually be deleted. 3606 Builder.SetInsertPoint( 3607 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3608 3609 // Create a phi node for the new recurrence. The current value will either be 3610 // the initial value inserted into a vector or loop-varying vector value. 3611 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3612 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3613 3614 // Get the vectorized previous value of the last part UF - 1. It appears last 3615 // among all unrolled iterations, due to the order of their construction. 3616 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3617 3618 // Find and set the insertion point after the previous value if it is an 3619 // instruction. 3620 BasicBlock::iterator InsertPt; 3621 // Note that the previous value may have been constant-folded so it is not 3622 // guaranteed to be an instruction in the vector loop. 3623 // FIXME: Loop invariant values do not form recurrences. We should deal with 3624 // them earlier. 3625 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3626 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3627 else { 3628 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3629 if (isa<PHINode>(PreviousLastPart)) 3630 // If the previous value is a phi node, we should insert after all the phi 3631 // nodes in the block containing the PHI to avoid breaking basic block 3632 // verification. Note that the basic block may be different to 3633 // LoopVectorBody, in case we predicate the loop. 3634 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3635 else 3636 InsertPt = ++PreviousInst->getIterator(); 3637 } 3638 Builder.SetInsertPoint(&*InsertPt); 3639 3640 // We will construct a vector for the recurrence by combining the values for 3641 // the current and previous iterations. This is the required shuffle mask. 3642 SmallVector<int, 8> ShuffleMask(VF); 3643 ShuffleMask[0] = VF - 1; 3644 for (unsigned I = 1; I < VF; ++I) 3645 ShuffleMask[I] = I + VF - 1; 3646 3647 // The vector from which to take the initial value for the current iteration 3648 // (actual or unrolled). Initially, this is the vector phi node. 3649 Value *Incoming = VecPhi; 3650 3651 // Shuffle the current and previous vector and update the vector parts. 3652 for (unsigned Part = 0; Part < UF; ++Part) { 3653 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3654 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3655 auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, 3656 ShuffleMask) 3657 : Incoming; 3658 PhiPart->replaceAllUsesWith(Shuffle); 3659 cast<Instruction>(PhiPart)->eraseFromParent(); 3660 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3661 Incoming = PreviousPart; 3662 } 3663 3664 // Fix the latch value of the new recurrence in the vector loop. 3665 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3666 3667 // Extract the last vector element in the middle block. 
This will be the
3668 // initial value for the recurrence when jumping to the scalar loop.
3669 auto *ExtractForScalar = Incoming;
3670 if (VF > 1) {
3671 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3672 ExtractForScalar = Builder.CreateExtractElement(
3673 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3674 }
3675 // Extract the second-to-last element in the middle block if the
3676 // Phi is used outside the loop. We need to extract the phi itself
3677 // and not the last element (the phi update in the current iteration). This
3678 // will be the value when jumping to the exit block from the LoopMiddleBlock,
3679 // when the scalar loop is not run at all.
3680 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3681 if (VF > 1)
3682 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3683 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3684 // When the loop is unrolled without vectorizing, initialize
3685 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
3686 // value of `Incoming`. This is analogous to the vectorized case above:
3687 // extracting the second-to-last element when VF > 1.
3688 else if (UF > 1)
3689 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3690
3691 // Fix the initial value of the original recurrence in the scalar loop.
3692 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3693 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3694 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3695 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3696 Start->addIncoming(Incoming, BB);
3697 }
3698
3699 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3700 Phi->setName("scalar.recur");
3701
3702 // Finally, fix users of the recurrence outside the loop. The users will need
3703 // either the last value of the scalar recurrence or the last value of the
3704 // vector recurrence we extracted in the middle block. Since the loop is in
3705 // LCSSA form, we just need to find all the phi nodes for the original scalar
3706 // recurrence in the exit block, and then add an edge for the middle block.
3707 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3708 if (LCSSAPhi.getIncomingValue(0) == Phi) {
3709 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3710 }
3711 }
3712 }
3713
3714 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3715 Constant *Zero = Builder.getInt32(0);
3716
3717 // Get its reduction variable descriptor.
3718 assert(Legal->isReductionVariable(Phi) &&
3719 "Unable to find the reduction variable");
3720 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3721
3722 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3723 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3724 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3725 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3726 RdxDesc.getMinMaxRecurrenceKind();
3727 setDebugLocFromInst(Builder, ReductionStartValue);
3728
3729 // We need to generate a reduction vector from the incoming scalar.
3730 // To do so, we need to generate the 'identity' vector and override
3731 // one of the elements with the incoming scalar reduction. We need
3732 // to do it in the vector-loop preheader.
3733 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3734
3735 // This is the vector-clone of the value that leaves the loop.
3736 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3737
3738 // Find the reduction identity variable. Zero for addition, or, and xor;
3739 // one for multiplication; -1 for and.
3740 Value *Identity;
3741 Value *VectorStart;
3742 if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3743 RK == RecurrenceDescriptor::RK_FloatMinMax) {
3744 // MinMax reductions have the start value as their identity.
3745 if (VF == 1) {
3746 VectorStart = Identity = ReductionStartValue;
3747 } else {
3748 VectorStart = Identity =
3749 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3750 }
3751 } else {
3752 // Handle other reduction kinds:
3753 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3754 RK, VecTy->getScalarType());
3755 if (VF == 1) {
3756 Identity = Iden;
3757 // This vector is the Identity vector where the first element is the
3758 // incoming scalar reduction.
3759 VectorStart = ReductionStartValue;
3760 } else {
3761 Identity = ConstantVector::getSplat({VF, false}, Iden);
3762
3763 // This vector is the Identity vector where the first element is the
3764 // incoming scalar reduction.
3765 VectorStart =
3766 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3767 }
3768 }
3769
3770 // Wrap flags are in general invalid after vectorization, clear them.
3771 clearReductionWrapFlags(RdxDesc);
3772
3773 // Fix the vector-loop phi.
3774
3775 // Reductions do not have to start at zero. They can start with
3776 // any loop invariant values.
3777 BasicBlock *Latch = OrigLoop->getLoopLatch();
3778 Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3779
3780 for (unsigned Part = 0; Part < UF; ++Part) {
3781 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3782 Value *Val = getOrCreateVectorValue(LoopVal, Part);
3783 // Make sure to add the reduction start value only to the
3784 // first unroll part.
3785 Value *StartVal = (Part == 0) ? VectorStart : Identity;
3786 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3787 cast<PHINode>(VecRdxPhi)
3788 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3789 }
3790
3791 // Before each round, move the insertion point right between
3792 // the PHIs and the values we are going to write.
3793 // This allows us to write both PHINodes and the extractelement
3794 // instructions.
3795 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3796
3797 setDebugLocFromInst(Builder, LoopExitInst);
3798
3799 // If the tail is folded by masking, the vector value that leaves the loop
3800 // should be the Select choosing between the vectorized LoopExitInst and the
3801 // vectorized Phi, instead of the former.
3802 if (Cost->foldTailByMasking()) {
3803 for (unsigned Part = 0; Part < UF; ++Part) {
3804 Value *VecLoopExitInst =
3805 VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3806 Value *Sel = nullptr;
3807 for (User *U : VecLoopExitInst->users()) {
3808 if (isa<SelectInst>(U)) {
3809 assert(!Sel && "Reduction exit feeding two selects");
3810 Sel = U;
3811 } else
3812 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3813 }
3814 assert(Sel && "Reduction exit feeds no select");
3815 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3816 }
3817 }
3818
3819 // If the vector reduction can be performed in a smaller type, we truncate
3820 // then extend the loop exit value to enable InstCombine to evaluate the
3821 // entire expression in the smaller type.
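// For example, a reduction declared on i32 whose recurrence was shown to fit
// in i8 has its <4 x i32> exit values truncated to <4 x i8> and then sign-
// or zero-extended back, so that InstCombine can later evaluate the whole
// reduction chain in i8.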
3822 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { 3823 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3824 Builder.SetInsertPoint( 3825 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 3826 VectorParts RdxParts(UF); 3827 for (unsigned Part = 0; Part < UF; ++Part) { 3828 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3829 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3830 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3831 : Builder.CreateZExt(Trunc, VecTy); 3832 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 3833 UI != RdxParts[Part]->user_end();) 3834 if (*UI != Trunc) { 3835 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 3836 RdxParts[Part] = Extnd; 3837 } else { 3838 ++UI; 3839 } 3840 } 3841 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3842 for (unsigned Part = 0; Part < UF; ++Part) { 3843 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3844 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 3845 } 3846 } 3847 3848 // Reduce all of the unrolled parts into a single vector. 3849 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 3850 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3851 3852 // The middle block terminator has already been assigned a DebugLoc here (the 3853 // OrigLoop's single latch terminator). We want the whole middle block to 3854 // appear to execute on this line because: (a) it is all compiler generated, 3855 // (b) these instructions are always executed after evaluating the latch 3856 // conditional branch, and (c) other passes may add new predecessors which 3857 // terminate on this line. This is the easiest way to ensure we don't 3858 // accidentally cause an extra step back into the loop while debugging. 3859 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 3860 for (unsigned Part = 1; Part < UF; ++Part) { 3861 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3862 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3863 // Floating point operations had to be 'fast' to enable the reduction. 3864 ReducedPartRdx = addFastMathFlag( 3865 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 3866 ReducedPartRdx, "bin.rdx"), 3867 RdxDesc.getFastMathFlags()); 3868 else 3869 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 3870 RdxPart); 3871 } 3872 3873 if (VF > 1) { 3874 bool NoNaN = Legal->hasFunNoNaNAttr(); 3875 ReducedPartRdx = 3876 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 3877 // If the reduction can be performed in a smaller type, we need to extend 3878 // the reduction to the wider type before we branch to the original loop. 3879 if (Phi->getType() != RdxDesc.getRecurrenceType()) 3880 ReducedPartRdx = 3881 RdxDesc.isSigned() 3882 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 3883 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 3884 } 3885 3886 // Create a phi node that merges control-flow from the backedge-taken check 3887 // block and the middle block. 
3888 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 3889 LoopScalarPreHeader->getTerminator()); 3890 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3891 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3892 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3893 3894 // Now, we need to fix the users of the reduction variable 3895 // inside and outside of the scalar remainder loop. 3896 // We know that the loop is in LCSSA form. We need to update the 3897 // PHI nodes in the exit blocks. 3898 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3899 // All PHINodes need to have a single entry edge, or two if 3900 // we already fixed them. 3901 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 3902 3903 // We found a reduction value exit-PHI. Update it with the 3904 // incoming bypass edge. 3905 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 3906 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 3907 } // end of the LCSSA phi scan. 3908 3909 // Fix the scalar loop reduction variable with the incoming reduction sum 3910 // from the vector body and from the backedge value. 3911 int IncomingEdgeBlockIdx = 3912 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 3913 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 3914 // Pick the other block. 3915 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 3916 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3917 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3918 } 3919 3920 void InnerLoopVectorizer::clearReductionWrapFlags( 3921 RecurrenceDescriptor &RdxDesc) { 3922 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3923 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 3924 RK != RecurrenceDescriptor::RK_IntegerMult) 3925 return; 3926 3927 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 3928 assert(LoopExitInstr && "null loop exit instruction"); 3929 SmallVector<Instruction *, 8> Worklist; 3930 SmallPtrSet<Instruction *, 8> Visited; 3931 Worklist.push_back(LoopExitInstr); 3932 Visited.insert(LoopExitInstr); 3933 3934 while (!Worklist.empty()) { 3935 Instruction *Cur = Worklist.pop_back_val(); 3936 if (isa<OverflowingBinaryOperator>(Cur)) 3937 for (unsigned Part = 0; Part < UF; ++Part) { 3938 Value *V = getOrCreateVectorValue(Cur, Part); 3939 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 3940 } 3941 3942 for (User *U : Cur->users()) { 3943 Instruction *UI = cast<Instruction>(U); 3944 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 3945 Visited.insert(UI).second) 3946 Worklist.push_back(UI); 3947 } 3948 } 3949 } 3950 3951 void InnerLoopVectorizer::fixLCSSAPHIs() { 3952 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3953 if (LCSSAPhi.getNumIncomingValues() == 1) { 3954 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 3955 // Non-instruction incoming values will have only one value. 3956 unsigned LastLane = 0; 3957 if (isa<Instruction>(IncomingValue)) 3958 LastLane = Cost->isUniformAfterVectorization( 3959 cast<Instruction>(IncomingValue), VF) 3960 ? 0 3961 : VF - 1; 3962 // Can be a loop invariant incoming value or the last scalar value to be 3963 // extracted from the vectorized loop. 
3964 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3965 Value *lastIncomingValue =
3966 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3967 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3968 }
3969 }
3970 }
3971
3972 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3973 // The basic block and loop containing the predicated instruction.
3974 auto *PredBB = PredInst->getParent();
3975 auto *VectorLoop = LI->getLoopFor(PredBB);
3976
3977 // Initialize a worklist with the operands of the predicated instruction.
3978 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3979
3980 // Holds instructions that we need to analyze again. An instruction may be
3981 // reanalyzed if we don't yet know if we can sink it or not.
3982 SmallVector<Instruction *, 8> InstsToReanalyze;
3983
3984 // Returns true if a given use occurs in the predicated block. Phi nodes use
3985 // their operands in their corresponding predecessor blocks.
3986 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3987 auto *I = cast<Instruction>(U.getUser());
3988 BasicBlock *BB = I->getParent();
3989 if (auto *Phi = dyn_cast<PHINode>(I))
3990 BB = Phi->getIncomingBlock(
3991 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3992 return BB == PredBB;
3993 };
3994
3995 // Iteratively sink the scalarized operands of the predicated instruction
3996 // into the block we created for it. When an instruction is sunk, its
3997 // operands are then added to the worklist. The algorithm ends once a full
3998 // pass through the worklist fails to sink a single instruction.
3999 bool Changed;
4000 do {
4001 // Add the instructions that need to be reanalyzed to the worklist, and
4002 // reset the changed indicator.
4003 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4004 InstsToReanalyze.clear();
4005 Changed = false;
4006
4007 while (!Worklist.empty()) {
4008 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4009
4010 // We can't sink an instruction if it is a phi node, is already in the
4011 // predicated block, is not in the loop, or may have side effects.
4012 if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4013 !VectorLoop->contains(I) || I->mayHaveSideEffects())
4014 continue;
4015
4016 // It's legal to sink the instruction if all its uses occur in the
4017 // predicated block. Otherwise, there's nothing to do yet, and we may
4018 // need to reanalyze the instruction.
4019 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4020 InstsToReanalyze.push_back(I);
4021 continue;
4022 }
4023
4024 // Move the instruction to the beginning of the predicated block, and add
4025 // its operands to the worklist.
4026 I->moveBefore(&*PredBB->getFirstInsertionPt());
4027 Worklist.insert(I->op_begin(), I->op_end());
4028
4029 // The sinking may have enabled other instructions to be sunk, so we will
4030 // need to iterate.
4031 Changed = true; 4032 } 4033 } while (Changed); 4034 } 4035 4036 void InnerLoopVectorizer::fixNonInductionPHIs() { 4037 for (PHINode *OrigPhi : OrigPHIsToFix) { 4038 PHINode *NewPhi = 4039 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4040 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4041 4042 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4043 predecessors(OrigPhi->getParent())); 4044 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4045 predecessors(NewPhi->getParent())); 4046 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4047 "Scalar and Vector BB should have the same number of predecessors"); 4048 4049 // The insertion point in Builder may be invalidated by the time we get 4050 // here. Force the Builder insertion point to something valid so that we do 4051 // not run into issues during insertion point restore in 4052 // getOrCreateVectorValue calls below. 4053 Builder.SetInsertPoint(NewPhi); 4054 4055 // The predecessor order is preserved and we can rely on mapping between 4056 // scalar and vector block predecessors. 4057 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4058 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4059 4060 // When looking up the new scalar/vector values to fix up, use incoming 4061 // values from original phi. 4062 Value *ScIncV = 4063 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4064 4065 // Scalar incoming value may need a broadcast 4066 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4067 NewPhi->addIncoming(NewIncV, NewPredBB); 4068 } 4069 } 4070 } 4071 4072 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, 4073 unsigned VF, bool IsPtrLoopInvariant, 4074 SmallBitVector &IsIndexLoopInvariant) { 4075 // Construct a vector GEP by widening the operands of the scalar GEP as 4076 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4077 // results in a vector of pointers when at least one operand of the GEP 4078 // is vector-typed. Thus, to keep the representation compact, we only use 4079 // vector-typed operands for loop-varying values. 4080 4081 if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4082 // If we are vectorizing, but the GEP has only loop-invariant operands, 4083 // the GEP we build (by only using vector-typed operands for 4084 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4085 // produce a vector of pointers, we need to either arbitrarily pick an 4086 // operand to broadcast, or broadcast a clone of the original GEP. 4087 // Here, we broadcast a clone of the original. 4088 // 4089 // TODO: If at some point we decide to scalarize instructions having 4090 // loop-invariant operands, this special case will no longer be 4091 // required. We would add the scalarization decision to 4092 // collectLoopScalars() and teach getVectorValue() to broadcast 4093 // the lane-zero scalar value. 4094 auto *Clone = Builder.Insert(GEP->clone()); 4095 for (unsigned Part = 0; Part < UF; ++Part) { 4096 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4097 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4098 addMetadata(EntryPart, GEP); 4099 } 4100 } else { 4101 // If the GEP has at least one loop-varying operand, we are sure to 4102 // produce a vector of pointers. But if we are only unrolling, we want 4103 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4104 // produce with the code below will be scalar (if VF == 1) or vector 4105 // (otherwise). 
Note that for the unroll-only case, we still maintain 4106 // values in the vector mapping with initVector, as we do for other 4107 // instructions. 4108 for (unsigned Part = 0; Part < UF; ++Part) { 4109 // The pointer operand of the new GEP. If it's loop-invariant, we 4110 // won't broadcast it. 4111 auto *Ptr = IsPtrLoopInvariant 4112 ? GEP->getPointerOperand() 4113 : getOrCreateVectorValue(GEP->getPointerOperand(), Part); 4114 4115 // Collect all the indices for the new GEP. If any index is 4116 // loop-invariant, we won't broadcast it. 4117 SmallVector<Value *, 4> Indices; 4118 for (auto Index : enumerate(GEP->indices())) { 4119 Value *User = Index.value().get(); 4120 if (IsIndexLoopInvariant[Index.index()]) 4121 Indices.push_back(User); 4122 else 4123 Indices.push_back(getOrCreateVectorValue(User, Part)); 4124 } 4125 4126 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4127 // but it should be a vector, otherwise. 4128 auto *NewGEP = 4129 GEP->isInBounds() 4130 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4131 Indices) 4132 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4133 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4134 "NewGEP is not a pointer vector"); 4135 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4136 addMetadata(NewGEP, GEP); 4137 } 4138 } 4139 } 4140 4141 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4142 unsigned VF) { 4143 PHINode *P = cast<PHINode>(PN); 4144 if (EnableVPlanNativePath) { 4145 // Currently we enter here in the VPlan-native path for non-induction 4146 // PHIs where all control flow is uniform. We simply widen these PHIs. 4147 // Create a vector phi with no operands - the vector phi operands will be 4148 // set at the end of vector code generation. 4149 Type *VecTy = 4150 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4151 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4152 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4153 OrigPHIsToFix.push_back(P); 4154 4155 return; 4156 } 4157 4158 assert(PN->getParent() == OrigLoop->getHeader() && 4159 "Non-header phis should have been handled elsewhere"); 4160 4161 // In order to support recurrences we need to be able to vectorize Phi nodes. 4162 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4163 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4164 // this value when we vectorize all of the instructions that use the PHI. 4165 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4166 for (unsigned Part = 0; Part < UF; ++Part) { 4167 // This is phase one of vectorizing PHIs. 4168 Type *VecTy = 4169 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4170 Value *EntryPart = PHINode::Create( 4171 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4172 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4173 } 4174 return; 4175 } 4176 4177 setDebugLocFromInst(Builder, P); 4178 4179 // This PHINode must be an induction variable. 4180 // Make sure that we know about it. 4181 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4182 4183 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4184 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4185 4186 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4187 // which can be found from the original scalar operations. 
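// As an illustrative sketch of the IK_PtrInduction case handled below (the
// loop and the constants are only an example), a loop such as
//
//   for (int *p = a; p != a + n; ++p)
//     *p = 0;
//
// vectorized with VF = 4 and UF = 2 gets eight scalar "next.gep" pointers per
// wide iteration, one per (Part, Lane) pair, derived from the normalized
// induction as a + (Part * VF + Lane); scalar GEPs are preferred here over a
// single vector-of-pointers GEP.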
4188 switch (II.getKind()) { 4189 case InductionDescriptor::IK_NoInduction: 4190 llvm_unreachable("Unknown induction"); 4191 case InductionDescriptor::IK_IntInduction: 4192 case InductionDescriptor::IK_FpInduction: 4193 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4194 case InductionDescriptor::IK_PtrInduction: { 4195 // Handle the pointer induction variable case. 4196 assert(P->getType()->isPointerTy() && "Unexpected type."); 4197 // This is the normalized GEP that starts counting at zero. 4198 Value *PtrInd = Induction; 4199 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); 4200 // Determine the number of scalars we need to generate for each unroll 4201 // iteration. If the instruction is uniform, we only need to generate the 4202 // first lane. Otherwise, we generate all VF values. 4203 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; 4204 // These are the scalar results. Notice that we don't generate vector GEPs 4205 // because scalar GEPs result in better code. 4206 for (unsigned Part = 0; Part < UF; ++Part) { 4207 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4208 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); 4209 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4210 Value *SclrGep = 4211 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4212 SclrGep->setName("next.gep"); 4213 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4214 } 4215 } 4216 return; 4217 } 4218 } 4219 } 4220 4221 /// A helper function for checking whether an integer division-related 4222 /// instruction may divide by zero (in which case it must be predicated if 4223 /// executed conditionally in the scalar code). 4224 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4225 /// Non-zero divisors that are non compile-time constants will not be 4226 /// converted into multiplication, so we will still end up scalarizing 4227 /// the division, but can do so w/o predication. 4228 static bool mayDivideByZero(Instruction &I) { 4229 assert((I.getOpcode() == Instruction::UDiv || 4230 I.getOpcode() == Instruction::SDiv || 4231 I.getOpcode() == Instruction::URem || 4232 I.getOpcode() == Instruction::SRem) && 4233 "Unexpected instruction"); 4234 Value *Divisor = I.getOperand(1); 4235 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4236 return !CInt || CInt->isZero(); 4237 } 4238 4239 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, 4240 VPTransformState &State) { 4241 switch (I.getOpcode()) { 4242 case Instruction::Call: 4243 case Instruction::Br: 4244 case Instruction::PHI: 4245 case Instruction::GetElementPtr: 4246 case Instruction::Select: 4247 llvm_unreachable("This instruction is handled by a different recipe."); 4248 case Instruction::UDiv: 4249 case Instruction::SDiv: 4250 case Instruction::SRem: 4251 case Instruction::URem: 4252 case Instruction::Add: 4253 case Instruction::FAdd: 4254 case Instruction::Sub: 4255 case Instruction::FSub: 4256 case Instruction::FNeg: 4257 case Instruction::Mul: 4258 case Instruction::FMul: 4259 case Instruction::FDiv: 4260 case Instruction::FRem: 4261 case Instruction::Shl: 4262 case Instruction::LShr: 4263 case Instruction::AShr: 4264 case Instruction::And: 4265 case Instruction::Or: 4266 case Instruction::Xor: { 4267 // Just widen unops and binops. 
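// As a rough illustration (the source loop is only an example), a scalar
// operation such as
//
//   c[i] = a[i] + b[i];
//
// is widened below into one wide add per unroll part, e.g. two <4 x i32> adds
// for VF = 4 and UF = 2, with IR flags (nsw/nuw, fast-math) copied from the
// original instruction via copyIRFlags.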
4268 setDebugLocFromInst(Builder, &I); 4269 4270 for (unsigned Part = 0; Part < UF; ++Part) { 4271 SmallVector<Value *, 2> Ops; 4272 for (VPValue *VPOp : User.operands()) 4273 Ops.push_back(State.get(VPOp, Part)); 4274 4275 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4276 4277 if (auto *VecOp = dyn_cast<Instruction>(V)) 4278 VecOp->copyIRFlags(&I); 4279 4280 // Use this vector value for all users of the original instruction. 4281 VectorLoopValueMap.setVectorValue(&I, Part, V); 4282 addMetadata(V, &I); 4283 } 4284 4285 break; 4286 } 4287 case Instruction::ICmp: 4288 case Instruction::FCmp: { 4289 // Widen compares. Generate vector compares. 4290 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4291 auto *Cmp = cast<CmpInst>(&I); 4292 setDebugLocFromInst(Builder, Cmp); 4293 for (unsigned Part = 0; Part < UF; ++Part) { 4294 Value *A = State.get(User.getOperand(0), Part); 4295 Value *B = State.get(User.getOperand(1), Part); 4296 Value *C = nullptr; 4297 if (FCmp) { 4298 // Propagate fast math flags. 4299 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4300 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4301 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4302 } else { 4303 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4304 } 4305 VectorLoopValueMap.setVectorValue(&I, Part, C); 4306 addMetadata(C, &I); 4307 } 4308 4309 break; 4310 } 4311 4312 case Instruction::ZExt: 4313 case Instruction::SExt: 4314 case Instruction::FPToUI: 4315 case Instruction::FPToSI: 4316 case Instruction::FPExt: 4317 case Instruction::PtrToInt: 4318 case Instruction::IntToPtr: 4319 case Instruction::SIToFP: 4320 case Instruction::UIToFP: 4321 case Instruction::Trunc: 4322 case Instruction::FPTrunc: 4323 case Instruction::BitCast: { 4324 auto *CI = cast<CastInst>(&I); 4325 setDebugLocFromInst(Builder, CI); 4326 4327 /// Vectorize casts. 4328 Type *DestTy = 4329 (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); 4330 4331 for (unsigned Part = 0; Part < UF; ++Part) { 4332 Value *A = State.get(User.getOperand(0), Part); 4333 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4334 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4335 addMetadata(Cast, &I); 4336 } 4337 break; 4338 } 4339 default: 4340 // This instruction is not vectorized by simple widening. 4341 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4342 llvm_unreachable("Unhandled instruction!"); 4343 } // end of switch. 4344 } 4345 4346 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, 4347 VPTransformState &State) { 4348 assert(!isa<DbgInfoIntrinsic>(I) && 4349 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4350 setDebugLocFromInst(Builder, &I); 4351 4352 Module *M = I.getParent()->getParent()->getParent(); 4353 auto *CI = cast<CallInst>(&I); 4354 4355 SmallVector<Type *, 4> Tys; 4356 for (Value *ArgOperand : CI->arg_operands()) 4357 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4358 4359 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4360 4361 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4362 // version of the instruction. 4363 // Is it beneficial to perform intrinsic call compared to lib call? 
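// A sketch of the decision made below (the call is only an example): for
//
//   y[i] = sinf(x[i]);
//
// the cost of the vector intrinsic (llvm.sin on <VF x float>) is compared
// against the cost of a vectorized library routine found via VFDatabase, and
// the cheaper of the two is emitted; scalarization is expected to have been
// chosen by a different recipe, as the assert below verifies.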
4364 bool NeedToScalarize = false; 4365 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4366 bool UseVectorIntrinsic = 4367 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4368 assert((UseVectorIntrinsic || !NeedToScalarize) && 4369 "Instruction should be scalarized elsewhere."); 4370 4371 for (unsigned Part = 0; Part < UF; ++Part) { 4372 SmallVector<Value *, 4> Args; 4373 for (auto &I : enumerate(ArgOperands.operands())) { 4374 // Some intrinsics have a scalar argument - don't replace it with a 4375 // vector. 4376 Value *Arg; 4377 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4378 Arg = State.get(I.value(), Part); 4379 else 4380 Arg = State.get(I.value(), {0, 0}); 4381 Args.push_back(Arg); 4382 } 4383 4384 Function *VectorF; 4385 if (UseVectorIntrinsic) { 4386 // Use vector version of the intrinsic. 4387 Type *TysForDecl[] = {CI->getType()}; 4388 if (VF > 1) 4389 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4390 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4391 assert(VectorF && "Can't retrieve vector intrinsic."); 4392 } else { 4393 // Use vector version of the function call. 4394 const VFShape Shape = 4395 VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/); 4396 #ifndef NDEBUG 4397 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4398 "Can't create vector function."); 4399 #endif 4400 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4401 } 4402 SmallVector<OperandBundleDef, 1> OpBundles; 4403 CI->getOperandBundlesAsDefs(OpBundles); 4404 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4405 4406 if (isa<FPMathOperator>(V)) 4407 V->copyFastMathFlags(CI); 4408 4409 VectorLoopValueMap.setVectorValue(&I, Part, V); 4410 addMetadata(V, &I); 4411 } 4412 } 4413 4414 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, 4415 bool InvariantCond) { 4416 setDebugLocFromInst(Builder, &I); 4417 4418 // The condition can be loop invariant but still defined inside the 4419 // loop. This means that we can't just use the original 'cond' value. 4420 // We have to take the 'vectorized' value and pick the first lane. 4421 // Instcombine will make this a no-op. 4422 4423 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); 4424 4425 for (unsigned Part = 0; Part < UF; ++Part) { 4426 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); 4427 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); 4428 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); 4429 Value *Sel = 4430 Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1); 4431 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4432 addMetadata(Sel, &I); 4433 } 4434 } 4435 4436 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4437 // We should not collect Scalars more than once per VF. Right now, this 4438 // function is called from collectUniformsAndScalars(), which already does 4439 // this check. Collecting Scalars for VF=1 does not make any sense. 4440 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && 4441 "This function should not be visited twice for the same VF"); 4442 4443 SmallSetVector<Instruction *, 8> Worklist; 4444 4445 // These sets are used to seed the analysis with pointers used by memory 4446 // accesses that will remain scalar. 4447 SmallSetVector<Instruction *, 8> ScalarPtrs; 4448 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4449 4450 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 
4451 // The pointer operands of loads and stores will be scalar as long as the 4452 // memory access is not a gather or scatter operation. The value operand of a 4453 // store will remain scalar if the store is scalarized. 4454 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4455 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4456 assert(WideningDecision != CM_Unknown && 4457 "Widening decision should be ready at this moment"); 4458 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4459 if (Ptr == Store->getValueOperand()) 4460 return WideningDecision == CM_Scalarize; 4461 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4462 "Ptr is neither a value nor a pointer operand"); 4463 return WideningDecision != CM_GatherScatter; 4464 }; 4465 4466 // A helper that returns true if the given value is a bitcast or 4467 // getelementptr instruction contained in the loop. 4468 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4469 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4470 isa<GetElementPtrInst>(V)) && 4471 !TheLoop->isLoopInvariant(V); 4472 }; 4473 4474 // A helper that evaluates a memory access's use of a pointer. If the use 4475 // will be a scalar use, and the pointer is only used by memory accesses, we 4476 // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4477 // PossibleNonScalarPtrs. 4478 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4479 // We only care about bitcast and getelementptr instructions contained in 4480 // the loop. 4481 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4482 return; 4483 4484 // If the pointer has already been identified as scalar (e.g., if it was 4485 // also identified as uniform), there's nothing to do. 4486 auto *I = cast<Instruction>(Ptr); 4487 if (Worklist.count(I)) 4488 return; 4489 4490 // If the use of the pointer will be a scalar use, and all users of the 4491 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4492 // place the pointer in PossibleNonScalarPtrs. 4493 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4494 return isa<LoadInst>(U) || isa<StoreInst>(U); 4495 })) 4496 ScalarPtrs.insert(I); 4497 else 4498 PossibleNonScalarPtrs.insert(I); 4499 }; 4500 4501 // We seed the scalars analysis with three classes of instructions: (1) 4502 // instructions marked uniform-after-vectorization, (2) bitcast and 4503 // getelementptr instructions used by memory accesses requiring a scalar use, 4504 // and (3) pointer induction variables and their update instructions (we 4505 // currently only scalarize these). 4506 // 4507 // (1) Add to the worklist all instructions that have been identified as 4508 // uniform-after-vectorization. 4509 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4510 4511 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4512 // memory accesses requiring a scalar use. The pointer operands of loads and 4513 // stores will be scalar as long as the memory access is not a gather or 4514 // scatter operation. The value operand of a store will remain scalar if the 4515 // store is scalarized.
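// For example (an illustrative loop), in
//
//   for (int i = 0; i < n; ++i)
//     if (c[i])
//       a[i] = 0;
//
// the conditional store is scalarized, so the getelementptr computing &a[i]
// is a scalar use; if that pointer is used only by memory accesses, the loop
// below places it in ScalarPtrs.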
4516 for (auto *BB : TheLoop->blocks()) 4517 for (auto &I : *BB) { 4518 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4519 evaluatePtrUse(Load, Load->getPointerOperand()); 4520 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4521 evaluatePtrUse(Store, Store->getPointerOperand()); 4522 evaluatePtrUse(Store, Store->getValueOperand()); 4523 } 4524 } 4525 for (auto *I : ScalarPtrs) 4526 if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) { 4527 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4528 Worklist.insert(I); 4529 } 4530 4531 // (3) Add to the worklist all pointer induction variables and their update 4532 // instructions. 4533 // 4534 // TODO: Once we are able to vectorize pointer induction variables we should 4535 // no longer insert them into the worklist here. 4536 auto *Latch = TheLoop->getLoopLatch(); 4537 for (auto &Induction : Legal->getInductionVars()) { 4538 auto *Ind = Induction.first; 4539 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4540 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) 4541 continue; 4542 Worklist.insert(Ind); 4543 Worklist.insert(IndUpdate); 4544 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4545 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4546 << "\n"); 4547 } 4548 4549 // Insert the forced scalars. 4550 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4551 // induction variable when the PHI user is scalarized. 4552 auto ForcedScalar = ForcedScalars.find(VF); 4553 if (ForcedScalar != ForcedScalars.end()) 4554 for (auto *I : ForcedScalar->second) 4555 Worklist.insert(I); 4556 4557 // Expand the worklist by looking through any bitcasts and getelementptr 4558 // instructions we've already identified as scalar. This is similar to the 4559 // expansion step in collectLoopUniforms(); however, here we're only 4560 // expanding to include additional bitcasts and getelementptr instructions. 4561 unsigned Idx = 0; 4562 while (Idx != Worklist.size()) { 4563 Instruction *Dst = Worklist[Idx++]; 4564 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4565 continue; 4566 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4567 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4568 auto *J = cast<Instruction>(U); 4569 return !TheLoop->contains(J) || Worklist.count(J) || 4570 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4571 isScalarUse(J, Src)); 4572 })) { 4573 Worklist.insert(Src); 4574 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4575 } 4576 } 4577 4578 // An induction variable will remain scalar if all users of the induction 4579 // variable and induction variable update remain scalar. 4580 for (auto &Induction : Legal->getInductionVars()) { 4581 auto *Ind = Induction.first; 4582 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4583 4584 // We already considered pointer induction variables, so there's no reason 4585 // to look at their users again. 4586 // 4587 // TODO: Once we are able to vectorize pointer induction variables we 4588 // should no longer skip over them here. 4589 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) 4590 continue; 4591 4592 // If tail-folding is applied, the primary induction variable will be used 4593 // to feed a vector compare. 
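// Conceptually (a sketch of the masking scheme, not the exact IR), the folded
// tail forms its active-lane mask as
//
//   mask = (iv.vec <= broadcast(BTC));
//
// i.e. a wide compare of the widened primary induction against the
// backedge-taken count, so the primary induction must keep a vector value and
// is not added to the scalar worklist in that case.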
4594 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4595 continue; 4596 4597 // Determine if all users of the induction variable are scalar after 4598 // vectorization. 4599 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4600 auto *I = cast<Instruction>(U); 4601 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4602 }); 4603 if (!ScalarInd) 4604 continue; 4605 4606 // Determine if all users of the induction variable update instruction are 4607 // scalar after vectorization. 4608 auto ScalarIndUpdate = 4609 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4610 auto *I = cast<Instruction>(U); 4611 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4612 }); 4613 if (!ScalarIndUpdate) 4614 continue; 4615 4616 // The induction variable and its update instruction will remain scalar. 4617 Worklist.insert(Ind); 4618 Worklist.insert(IndUpdate); 4619 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4620 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4621 << "\n"); 4622 } 4623 4624 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4625 } 4626 4627 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { 4628 if (!blockNeedsPredication(I->getParent())) 4629 return false; 4630 switch(I->getOpcode()) { 4631 default: 4632 break; 4633 case Instruction::Load: 4634 case Instruction::Store: { 4635 if (!Legal->isMaskRequired(I)) 4636 return false; 4637 auto *Ptr = getLoadStorePointerOperand(I); 4638 auto *Ty = getMemInstValueType(I); 4639 // We have already decided how to vectorize this instruction, get that 4640 // result. 4641 if (VF > 1) { 4642 InstWidening WideningDecision = getWideningDecision(I, VF); 4643 assert(WideningDecision != CM_Unknown && 4644 "Widening decision should be ready at this moment"); 4645 return WideningDecision == CM_Scalarize; 4646 } 4647 const Align Alignment = getLoadStoreAlignment(I); 4648 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4649 isLegalMaskedGather(Ty, Alignment)) 4650 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4651 isLegalMaskedScatter(Ty, Alignment)); 4652 } 4653 case Instruction::UDiv: 4654 case Instruction::SDiv: 4655 case Instruction::SRem: 4656 case Instruction::URem: 4657 return mayDivideByZero(*I); 4658 } 4659 return false; 4660 } 4661 4662 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, 4663 unsigned VF) { 4664 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4665 assert(getWideningDecision(I, VF) == CM_Unknown && 4666 "Decision should not be set yet."); 4667 auto *Group = getInterleavedAccessGroup(I); 4668 assert(Group && "Must have a group."); 4669 4670 // If the instruction's allocated size doesn't equal it's type size, it 4671 // requires padding and will be scalarized. 4672 auto &DL = I->getModule()->getDataLayout(); 4673 auto *ScalarTy = getMemInstValueType(I); 4674 if (hasIrregularType(ScalarTy, DL, VF)) 4675 return false; 4676 4677 // Check if masking is required. 4678 // A Group may need masking for one of two reasons: it resides in a block that 4679 // needs predication, or it was decided to use masking to deal with gaps. 
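// For example (types and layout are illustrative), a group that reads only
// the 'x' member of
//
//   struct S { int x, y; };
//
// as s[i].x has a gap where 'y' would be; its last wide load may touch memory
// past the end of the underlying buffer, so the group needs either a scalar
// epilogue or, when that is not allowed, a masked access as checked below.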
4680 bool PredicatedAccessRequiresMasking = 4681 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 4682 bool AccessWithGapsRequiresMasking = 4683 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 4684 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 4685 return true; 4686 4687 // If masked interleaving is required, we expect that the user/target had 4688 // enabled it, because otherwise it either wouldn't have been created or 4689 // it should have been invalidated by the CostModel. 4690 assert(useMaskedInterleavedAccesses(TTI) && 4691 "Masked interleave-groups for predicated accesses are not enabled."); 4692 4693 auto *Ty = getMemInstValueType(I); 4694 const Align Alignment = getLoadStoreAlignment(I); 4695 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4696 : TTI.isLegalMaskedStore(Ty, Alignment); 4697 } 4698 4699 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, 4700 unsigned VF) { 4701 // Get and ensure we have a valid memory instruction. 4702 LoadInst *LI = dyn_cast<LoadInst>(I); 4703 StoreInst *SI = dyn_cast<StoreInst>(I); 4704 assert((LI || SI) && "Invalid memory instruction"); 4705 4706 auto *Ptr = getLoadStorePointerOperand(I); 4707 4708 // In order to be widened, the pointer should be consecutive, first of all. 4709 if (!Legal->isConsecutivePtr(Ptr)) 4710 return false; 4711 4712 // If the instruction is a store located in a predicated block, it will be 4713 // scalarized. 4714 if (isScalarWithPredication(I)) 4715 return false; 4716 4717 // If the instruction's allocated size doesn't equal it's type size, it 4718 // requires padding and will be scalarized. 4719 auto &DL = I->getModule()->getDataLayout(); 4720 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 4721 if (hasIrregularType(ScalarTy, DL, VF)) 4722 return false; 4723 4724 return true; 4725 } 4726 4727 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { 4728 // We should not collect Uniforms more than once per VF. Right now, 4729 // this function is called from collectUniformsAndScalars(), which 4730 // already does this check. Collecting Uniforms for VF=1 does not make any 4731 // sense. 4732 4733 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && 4734 "This function should not be visited twice for the same VF"); 4735 4736 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4737 // not analyze again. Uniforms.count(VF) will return 1. 4738 Uniforms[VF].clear(); 4739 4740 // We now know that the loop is vectorizable! 4741 // Collect instructions inside the loop that will remain uniform after 4742 // vectorization. 4743 4744 // Global values, params and instructions outside of current loop are out of 4745 // scope. 4746 auto isOutOfScope = [&](Value *V) -> bool { 4747 Instruction *I = dyn_cast<Instruction>(V); 4748 return (!I || !TheLoop->contains(I)); 4749 }; 4750 4751 SetVector<Instruction *> Worklist; 4752 BasicBlock *Latch = TheLoop->getLoopLatch(); 4753 4754 // Instructions that are scalar with predication must not be considered 4755 // uniform after vectorization, because that would create an erroneous 4756 // replicating region where only a single instance out of VF should be formed. 4757 // TODO: optimize such seldom cases if found important, see PR40816. 
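// For instance, in an illustrative loop such as
//
//   for (int i = 0; i < n; ++i)
//     if (d[i] != 0)
//       a[i] = b[i] / d[i];
//
// the predicated division is scalar-with-predication: it must be replicated
// and guarded per lane, so emitting a single "uniform" copy shared by all VF
// lanes would be wrong, and the helper below refuses to add it.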
4758 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4759 if (isScalarWithPredication(I, VF)) { 4760 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4761 << *I << "\n"); 4762 return; 4763 } 4764 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4765 Worklist.insert(I); 4766 }; 4767 4768 // Start with the conditional branch. If the branch condition is an 4769 // instruction contained in the loop that is only used by the branch, it is 4770 // uniform. 4771 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4772 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4773 addToWorklistIfAllowed(Cmp); 4774 4775 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 4776 // are pointers that are treated like consecutive pointers during 4777 // vectorization. The pointer operands of interleaved accesses are an 4778 // example. 4779 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 4780 4781 // Holds pointer operands of instructions that are possibly non-uniform. 4782 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 4783 4784 auto isUniformDecision = [&](Instruction *I, unsigned VF) { 4785 InstWidening WideningDecision = getWideningDecision(I, VF); 4786 assert(WideningDecision != CM_Unknown && 4787 "Widening decision should be ready at this moment"); 4788 4789 return (WideningDecision == CM_Widen || 4790 WideningDecision == CM_Widen_Reverse || 4791 WideningDecision == CM_Interleave); 4792 }; 4793 // Iterate over the instructions in the loop, and collect all 4794 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 4795 // that a consecutive-like pointer operand will be scalarized, we collect it 4796 // in PossibleNonUniformPtrs instead. We use two sets here because a single 4797 // getelementptr instruction can be used by both vectorized and scalarized 4798 // memory instructions. For example, if a loop loads and stores from the same 4799 // location, but the store is conditional, the store will be scalarized, and 4800 // the getelementptr won't remain uniform. 4801 for (auto *BB : TheLoop->blocks()) 4802 for (auto &I : *BB) { 4803 // If there's no pointer operand, there's nothing to do. 4804 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 4805 if (!Ptr) 4806 continue; 4807 4808 // True if all users of Ptr are memory accesses that have Ptr as their 4809 // pointer operand. 4810 auto UsersAreMemAccesses = 4811 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 4812 return getLoadStorePointerOperand(U) == Ptr; 4813 }); 4814 4815 // Ensure the memory instruction will not be scalarized or used by 4816 // gather/scatter, making its pointer operand non-uniform. If the pointer 4817 // operand is used by any instruction other than a memory access, we 4818 // conservatively assume the pointer operand may be non-uniform. 4819 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 4820 PossibleNonUniformPtrs.insert(Ptr); 4821 4822 // If the memory instruction will be vectorized and its pointer operand 4823 // is consecutive-like, or interleaving - the pointer operand should 4824 // remain uniform. 4825 else 4826 ConsecutiveLikePtrs.insert(Ptr); 4827 } 4828 4829 // Add to the Worklist all consecutive and consecutive-like pointers that 4830 // aren't also identified as possibly non-uniform. 
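// For example (an illustrative loop), in
//
//   for (int i = 0; i < n; ++i) {
//     int v = a[i];
//     if (v < 0)
//       a[i] = 0;
//   }
//
// the same &a[i] address feeds a widened load and a scalarized conditional
// store, so it lands in PossibleNonUniformPtrs and is filtered out here.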
4831 for (auto *V : ConsecutiveLikePtrs) 4832 if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) 4833 addToWorklistIfAllowed(V); 4834 4835 // Expand Worklist in topological order: whenever a new instruction 4836 // is added , its users should be already inside Worklist. It ensures 4837 // a uniform instruction will only be used by uniform instructions. 4838 unsigned idx = 0; 4839 while (idx != Worklist.size()) { 4840 Instruction *I = Worklist[idx++]; 4841 4842 for (auto OV : I->operand_values()) { 4843 // isOutOfScope operands cannot be uniform instructions. 4844 if (isOutOfScope(OV)) 4845 continue; 4846 // First order recurrence Phi's should typically be considered 4847 // non-uniform. 4848 auto *OP = dyn_cast<PHINode>(OV); 4849 if (OP && Legal->isFirstOrderRecurrence(OP)) 4850 continue; 4851 // If all the users of the operand are uniform, then add the 4852 // operand into the uniform worklist. 4853 auto *OI = cast<Instruction>(OV); 4854 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4855 auto *J = cast<Instruction>(U); 4856 return Worklist.count(J) || 4857 (OI == getLoadStorePointerOperand(J) && 4858 isUniformDecision(J, VF)); 4859 })) 4860 addToWorklistIfAllowed(OI); 4861 } 4862 } 4863 4864 // Returns true if Ptr is the pointer operand of a memory access instruction 4865 // I, and I is known to not require scalarization. 4866 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4867 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4868 }; 4869 4870 // For an instruction to be added into Worklist above, all its users inside 4871 // the loop should also be in Worklist. However, this condition cannot be 4872 // true for phi nodes that form a cyclic dependence. We must process phi 4873 // nodes separately. An induction variable will remain uniform if all users 4874 // of the induction variable and induction variable update remain uniform. 4875 // The code below handles both pointer and non-pointer induction variables. 4876 for (auto &Induction : Legal->getInductionVars()) { 4877 auto *Ind = Induction.first; 4878 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4879 4880 // Determine if all users of the induction variable are uniform after 4881 // vectorization. 4882 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4883 auto *I = cast<Instruction>(U); 4884 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4885 isVectorizedMemAccessUse(I, Ind); 4886 }); 4887 if (!UniformInd) 4888 continue; 4889 4890 // Determine if all users of the induction variable update instruction are 4891 // uniform after vectorization. 4892 auto UniformIndUpdate = 4893 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4894 auto *I = cast<Instruction>(U); 4895 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4896 isVectorizedMemAccessUse(I, IndUpdate); 4897 }); 4898 if (!UniformIndUpdate) 4899 continue; 4900 4901 // The induction variable and its update instruction will remain uniform. 4902 addToWorklistIfAllowed(Ind); 4903 addToWorklistIfAllowed(IndUpdate); 4904 } 4905 4906 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4907 } 4908 4909 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4910 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4911 4912 if (Legal->getRuntimePointerChecking()->Need) { 4913 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4914 "runtime pointer checks needed. 
Enable vectorization of this " 4915 "loop with '#pragma clang loop vectorize(enable)' when " 4916 "compiling with -Os/-Oz", 4917 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4918 return true; 4919 } 4920 4921 if (!PSE.getUnionPredicate().getPredicates().empty()) { 4922 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4923 "runtime SCEV checks needed. Enable vectorization of this " 4924 "loop with '#pragma clang loop vectorize(enable)' when " 4925 "compiling with -Os/-Oz", 4926 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4927 return true; 4928 } 4929 4930 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4931 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4932 reportVectorizationFailure("Runtime stride check is required with -Os/-Oz", 4933 "runtime stride == 1 checks needed. Enable vectorization of " 4934 "this loop with '#pragma clang loop vectorize(enable)' when " 4935 "compiling with -Os/-Oz", 4936 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4937 return true; 4938 } 4939 4940 return false; 4941 } 4942 4943 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, 4944 unsigned UserIC) { 4945 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4946 // TODO: It may be useful to do this anyway, since the check is still likely 4947 // to be dynamically uniform if the target can skip it. 4948 reportVectorizationFailure( 4949 "Not inserting runtime ptr check for divergent target", 4950 "runtime pointer checks needed. Not enabled for divergent target", 4951 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 4952 return None; 4953 } 4954 4955 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4956 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4957 if (TC == 1) { 4958 reportVectorizationFailure("Single iteration (non) loop", 4959 "loop trip count is one, irrelevant for vectorization", 4960 "SingleIterationLoop", ORE, TheLoop); 4961 return None; 4962 } 4963 4964 switch (ScalarEpilogueStatus) { 4965 case CM_ScalarEpilogueAllowed: 4966 return UserVF ? UserVF : computeFeasibleMaxVF(TC); 4967 case CM_ScalarEpilogueNotNeededUsePredicate: 4968 LLVM_DEBUG( 4969 dbgs() << "LV: vector predicate hint/switch found.\n" 4970 << "LV: Not allowing scalar epilogue, creating predicated " 4971 << "vector loop.\n"); 4972 break; 4973 case CM_ScalarEpilogueNotAllowedLowTripLoop: 4974 // fallthrough as a special case of OptForSize 4975 case CM_ScalarEpilogueNotAllowedOptSize: 4976 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 4977 LLVM_DEBUG( 4978 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 4979 else 4980 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4981 << "count.\n"); 4982 4983 // Bail if runtime checks are required, which are not good when optimising 4984 // for size. 4985 if (runtimeChecksRequired()) 4986 return None; 4987 break; 4988 } 4989 4990 // Now try tail folding. 4991 4992 // Invalidate interleave groups that require an epilogue if we can't mask 4993 // the interleave-group. 4994 if (!useMaskedInterleavedAccesses(TTI)) { 4995 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 4996 "No decisions should have been taken at this point"); 4997 // Note: There is no need to invalidate any cost modeling decisions here, as 4998 // none were taken so far. 4999 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5000 } 5001 5002 unsigned MaxVF = UserVF ?
UserVF : computeFeasibleMaxVF(TC); 5003 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF; 5004 if (TC > 0 && TC % MaxVFtimesIC == 0) { 5005 // Accept MaxVF if we do not have a tail. 5006 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5007 return MaxVF; 5008 } 5009 5010 // If we don't know the precise trip count, or if the trip count that we 5011 // found modulo the vectorization factor is not zero, try to fold the tail 5012 // by masking. 5013 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5014 if (Legal->prepareToFoldTailByMasking()) { 5015 FoldTailByMasking = true; 5016 return MaxVF; 5017 } 5018 5019 if (TC == 0) { 5020 reportVectorizationFailure( 5021 "Unable to calculate the loop count due to complex control flow", 5022 "unable to calculate the loop count due to complex control flow", 5023 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5024 return None; 5025 } 5026 5027 reportVectorizationFailure( 5028 "Cannot optimize for size and vectorize at the same time.", 5029 "cannot optimize for size and vectorize at the same time. " 5030 "Enable vectorization of this loop with '#pragma clang loop " 5031 "vectorize(enable)' when compiling with -Os/-Oz", 5032 "NoTailLoopWithOptForSize", ORE, TheLoop); 5033 return None; 5034 } 5035 5036 unsigned 5037 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 5038 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5039 unsigned SmallestType, WidestType; 5040 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5041 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5042 5043 // Get the maximum safe dependence distance in bits computed by LAA. 5044 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5045 // the memory accesses that is most restrictive (involved in the smallest 5046 // dependence distance). 5047 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 5048 5049 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 5050 5051 unsigned MaxVectorSize = WidestRegister / WidestType; 5052 5053 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5054 << " / " << WidestType << " bits.\n"); 5055 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5056 << WidestRegister << " bits.\n"); 5057 5058 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 5059 " into one vector!"); 5060 if (MaxVectorSize == 0) { 5061 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5062 MaxVectorSize = 1; 5063 return MaxVectorSize; 5064 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5065 isPowerOf2_32(ConstTripCount)) { 5066 // We need to clamp the VF to be the ConstTripCount. There is no point in 5067 // choosing a higher viable VF as done in the loop below. 5068 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5069 << ConstTripCount << "\n"); 5070 MaxVectorSize = ConstTripCount; 5071 return MaxVectorSize; 5072 } 5073 5074 unsigned MaxVF = MaxVectorSize; 5075 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5076 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5077 // Collect all viable vectorization factors larger than the default MaxVF 5078 // (i.e. MaxVectorSize). 
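// A small worked example with assumed numbers: for a 256-bit widest register,
// WidestType = 32 and SmallestType = 8,
//
//   unsigned MaxVectorSize    = 256 / 32; // 8, the default MaxVF
//   unsigned NewMaxVectorSize = 256 / 8;  // 32
//
// so the candidate VFs collected below are 16 and 32; each candidate is then
// checked against per-class register pressure and the largest one that fits
// is chosen.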
5079 SmallVector<unsigned, 8> VFs; 5080 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5081 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5082 VFs.push_back(VS); 5083 5084 // For each VF calculate its register usage. 5085 auto RUs = calculateRegisterUsage(VFs); 5086 5087 // Select the largest VF which doesn't require more registers than existing 5088 // ones. 5089 for (int i = RUs.size() - 1; i >= 0; --i) { 5090 bool Selected = true; 5091 for (auto& pair : RUs[i].MaxLocalUsers) { 5092 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5093 if (pair.second > TargetNumRegisters) 5094 Selected = false; 5095 } 5096 if (Selected) { 5097 MaxVF = VFs[i]; 5098 break; 5099 } 5100 } 5101 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5102 if (MaxVF < MinVF) { 5103 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5104 << ") with target's minimum: " << MinVF << '\n'); 5105 MaxVF = MinVF; 5106 } 5107 } 5108 } 5109 return MaxVF; 5110 } 5111 5112 VectorizationFactor 5113 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5114 float Cost = expectedCost(1).first; 5115 const float ScalarCost = Cost; 5116 unsigned Width = 1; 5117 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5118 5119 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5120 if (ForceVectorization && MaxVF > 1) { 5121 // Ignore scalar width, because the user explicitly wants vectorization. 5122 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5123 // evaluation. 5124 Cost = std::numeric_limits<float>::max(); 5125 } 5126 5127 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5128 // Notice that the vector loop needs to be executed less times, so 5129 // we need to divide the cost of the vector loops by the width of 5130 // the vector elements. 5131 VectorizationCostTy C = expectedCost(i); 5132 float VectorCost = C.first / (float)i; 5133 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5134 << " costs: " << (int)VectorCost << ".\n"); 5135 if (!C.second && !ForceVectorization) { 5136 LLVM_DEBUG( 5137 dbgs() << "LV: Not considering vector loop of width " << i 5138 << " because it will not generate any vector instructions.\n"); 5139 continue; 5140 } 5141 if (VectorCost < Cost) { 5142 Cost = VectorCost; 5143 Width = i; 5144 } 5145 } 5146 5147 if (!EnableCondStoresVectorization && NumPredStores) { 5148 reportVectorizationFailure("There are conditional stores.", 5149 "store that is conditionally executed prevents vectorization", 5150 "ConditionalStore", ORE, TheLoop); 5151 Width = 1; 5152 Cost = ScalarCost; 5153 } 5154 5155 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5156 << "LV: Vectorization seems to be not beneficial, " 5157 << "but was forced by a user.\n"); 5158 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5159 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; 5160 return Factor; 5161 } 5162 5163 std::pair<unsigned, unsigned> 5164 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5165 unsigned MinWidth = -1U; 5166 unsigned MaxWidth = 8; 5167 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5168 5169 // For each block. 5170 for (BasicBlock *BB : TheLoop->blocks()) { 5171 // For each instruction in the loop. 5172 for (Instruction &I : BB->instructionsWithoutDebug()) { 5173 Type *T = I.getType(); 5174 5175 // Skip ignored values. 
5176 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end()) 5177 continue; 5178 5179 // Only examine Loads, Stores and PHINodes. 5180 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5181 continue; 5182 5183 // Examine PHI nodes that are reduction variables. Update the type to 5184 // account for the recurrence type. 5185 if (auto *PN = dyn_cast<PHINode>(&I)) { 5186 if (!Legal->isReductionVariable(PN)) 5187 continue; 5188 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5189 T = RdxDesc.getRecurrenceType(); 5190 } 5191 5192 // Examine the stored values. 5193 if (auto *ST = dyn_cast<StoreInst>(&I)) 5194 T = ST->getValueOperand()->getType(); 5195 5196 // Ignore loaded pointer types and stored pointer types that are not 5197 // vectorizable. 5198 // 5199 // FIXME: The check here attempts to predict whether a load or store will 5200 // be vectorized. We only know this for certain after a VF has 5201 // been selected. Here, we assume that if an access can be 5202 // vectorized, it will be. We should also look at extending this 5203 // optimization to non-pointer types. 5204 // 5205 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5206 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5207 continue; 5208 5209 MinWidth = std::min(MinWidth, 5210 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5211 MaxWidth = std::max(MaxWidth, 5212 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5213 } 5214 } 5215 5216 return {MinWidth, MaxWidth}; 5217 } 5218 5219 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, 5220 unsigned LoopCost) { 5221 // -- The interleave heuristics -- 5222 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5223 // There are many micro-architectural considerations that we can't predict 5224 // at this level. For example, frontend pressure (on decode or fetch) due to 5225 // code size, or the number and capabilities of the execution ports. 5226 // 5227 // We use the following heuristics to select the interleave count: 5228 // 1. If the code has reductions, then we interleave to break the cross 5229 // iteration dependency. 5230 // 2. If the loop is really small, then we interleave to reduce the loop 5231 // overhead. 5232 // 3. We don't interleave if we think that we will spill registers to memory 5233 // due to the increased register pressure. 5234 5235 if (!isScalarEpilogueAllowed()) 5236 return 1; 5237 5238 // We used the distance for the interleave count. 5239 if (Legal->getMaxSafeDepDistBytes() != -1U) 5240 return 1; 5241 5242 // Do not interleave loops with a relatively small known or estimated trip 5243 // count. 5244 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5245 if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) 5246 return 1; 5247 5248 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5249 // We divide by these constants so assume that we have at least one 5250 // instruction that uses at least one register. 5251 for (auto& pair : R.MaxLocalUsers) { 5252 pair.second = std::max(pair.second, 1U); 5253 } 5254 5255 // We calculate the interleave count using the following formula. 5256 // Subtract the number of loop invariants from the number of available 5257 // registers. These registers are used by all of the interleaved instances. 5258 // Next, divide the remaining registers by the number of registers that is 5259 // required by the loop, in order to estimate how many parallel instances 5260 // fit without causing spills. 
All of this is rounded down if necessary to be 5261 // a power of two. We want power of two interleave count to simplify any 5262 // addressing operations or alignment considerations. 5263 // We also want power of two interleave counts to ensure that the induction 5264 // variable of the vector loop wraps to zero, when tail is folded by masking; 5265 // this currently happens when OptForSize, in which case IC is set to 1 above. 5266 unsigned IC = UINT_MAX; 5267 5268 for (auto& pair : R.MaxLocalUsers) { 5269 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5270 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5271 << " registers of " 5272 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5273 if (VF == 1) { 5274 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5275 TargetNumRegisters = ForceTargetNumScalarRegs; 5276 } else { 5277 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5278 TargetNumRegisters = ForceTargetNumVectorRegs; 5279 } 5280 unsigned MaxLocalUsers = pair.second; 5281 unsigned LoopInvariantRegs = 0; 5282 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5283 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5284 5285 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5286 // Don't count the induction variable as interleaved. 5287 if (EnableIndVarRegisterHeur) { 5288 TmpIC = 5289 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5290 std::max(1U, (MaxLocalUsers - 1))); 5291 } 5292 5293 IC = std::min(IC, TmpIC); 5294 } 5295 5296 // Clamp the interleave ranges to reasonable counts. 5297 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5298 5299 // Check if the user has overridden the max. 5300 if (VF == 1) { 5301 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5302 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5303 } else { 5304 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5305 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5306 } 5307 5308 // If trip count is known or estimated compile time constant, limit the 5309 // interleave count to be less than the trip count divided by VF. 5310 if (BestKnownTC) { 5311 MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); 5312 } 5313 5314 // If we did not calculate the cost for VF (because the user selected the VF) 5315 // then we calculate the cost of VF here. 5316 if (LoopCost == 0) 5317 LoopCost = expectedCost(VF).first; 5318 5319 assert(LoopCost && "Non-zero loop cost expected"); 5320 5321 // Clamp the calculated IC to be between the 1 and the max interleave count 5322 // that the target and trip count allows. 5323 if (IC > MaxInterleaveCount) 5324 IC = MaxInterleaveCount; 5325 else if (IC < 1) 5326 IC = 1; 5327 5328 // Interleave if we vectorized this loop and there is a reduction that could 5329 // benefit from interleaving. 5330 if (VF > 1 && !Legal->getReductionVars().empty()) { 5331 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5332 return IC; 5333 } 5334 5335 // Note that if we've already vectorized the loop we will have done the 5336 // runtime check and so interleaving won't require further checks. 5337 bool InterleavingRequiresRuntimePointerCheck = 5338 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5339 5340 // We want to interleave small loops in order to reduce the loop overhead and 5341 // potentially expose ILP opportunities. 
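// A worked example with assumed numbers (taking SmallLoopCost = 20 purely for
// illustration): if LoopCost = 4, the small-loop path below caps the
// register-based interleave count at
//
//   unsigned SmallIC = std::min(IC, (unsigned)PowerOf2Floor(20 / 4)); // <= 4
//
// i.e. enough copies that the loop overhead is roughly 5% of the body cost;
// the store/load port heuristic may raise this again when
// EnableLoadStoreRuntimeInterleave is set and max(StoresIC, LoadsIC) is
// larger.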
5342 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5343 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5344 // We assume that the cost overhead is 1 and we use the cost model 5345 // to estimate the cost of the loop and interleave until the cost of the 5346 // loop overhead is about 5% of the cost of the loop. 5347 unsigned SmallIC = 5348 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5349 5350 // Interleave until store/load ports (estimated by max interleave count) are 5351 // saturated. 5352 unsigned NumStores = Legal->getNumStores(); 5353 unsigned NumLoads = Legal->getNumLoads(); 5354 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5355 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5356 5357 // If we have a scalar reduction (vector reductions are already dealt with 5358 // by this point), we can increase the critical path length if the loop 5359 // we're interleaving is inside another loop. Limit, by default to 2, so the 5360 // critical path only gets increased by one reduction operation. 5361 if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) { 5362 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5363 SmallIC = std::min(SmallIC, F); 5364 StoresIC = std::min(StoresIC, F); 5365 LoadsIC = std::min(LoadsIC, F); 5366 } 5367 5368 if (EnableLoadStoreRuntimeInterleave && 5369 std::max(StoresIC, LoadsIC) > SmallIC) { 5370 LLVM_DEBUG( 5371 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5372 return std::max(StoresIC, LoadsIC); 5373 } 5374 5375 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5376 return SmallIC; 5377 } 5378 5379 // Interleave if this is a large loop (small loops are already dealt with by 5380 // this point) that could benefit from interleaving. 5381 bool HasReductions = !Legal->getReductionVars().empty(); 5382 if (TTI.enableAggressiveInterleaving(HasReductions)) { 5383 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5384 return IC; 5385 } 5386 5387 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5388 return 1; 5389 } 5390 5391 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5392 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { 5393 // This function calculates the register usage by measuring the highest number 5394 // of values that are alive at a single location. Obviously, this is a very 5395 // rough estimation. We scan the loop in a topological order in order and 5396 // assign a number to each instruction. We use RPO to ensure that defs are 5397 // met before their users. We assume that each instruction that has in-loop 5398 // users starts an interval. We record every time that an in-loop value is 5399 // used, so we have a list of the first and last occurrences of each 5400 // instruction. Next, we transpose this data structure into a multi map that 5401 // holds the list of intervals that *end* at a specific location. This multi 5402 // map allows us to perform a linear search. We scan the instructions linearly 5403 // and record each time that a new interval starts, by placing it in a set. 5404 // If we find this value in the multi-map then we remove it from the set. 5405 // The max register usage is the maximum size of the set. 5406 // We also search for instructions that are defined outside the loop, but are 5407 // used inside the loop. 
We need this number separately from the max-interval 5408 // usage number because when we unroll, loop-invariant values do not take 5409 // more register. 5410 LoopBlocksDFS DFS(TheLoop); 5411 DFS.perform(LI); 5412 5413 RegisterUsage RU; 5414 5415 // Each 'key' in the map opens a new interval. The values 5416 // of the map are the index of the 'last seen' usage of the 5417 // instruction that is the key. 5418 using IntervalMap = DenseMap<Instruction *, unsigned>; 5419 5420 // Maps instruction to its index. 5421 SmallVector<Instruction *, 64> IdxToInstr; 5422 // Marks the end of each interval. 5423 IntervalMap EndPoint; 5424 // Saves the list of instruction indices that are used in the loop. 5425 SmallPtrSet<Instruction *, 8> Ends; 5426 // Saves the list of values that are used in the loop but are 5427 // defined outside the loop, such as arguments and constants. 5428 SmallPtrSet<Value *, 8> LoopInvariants; 5429 5430 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5431 for (Instruction &I : BB->instructionsWithoutDebug()) { 5432 IdxToInstr.push_back(&I); 5433 5434 // Save the end location of each USE. 5435 for (Value *U : I.operands()) { 5436 auto *Instr = dyn_cast<Instruction>(U); 5437 5438 // Ignore non-instruction values such as arguments, constants, etc. 5439 if (!Instr) 5440 continue; 5441 5442 // If this instruction is outside the loop then record it and continue. 5443 if (!TheLoop->contains(Instr)) { 5444 LoopInvariants.insert(Instr); 5445 continue; 5446 } 5447 5448 // Overwrite previous end points. 5449 EndPoint[Instr] = IdxToInstr.size(); 5450 Ends.insert(Instr); 5451 } 5452 } 5453 } 5454 5455 // Saves the list of intervals that end with the index in 'key'. 5456 using InstrList = SmallVector<Instruction *, 2>; 5457 DenseMap<unsigned, InstrList> TransposeEnds; 5458 5459 // Transpose the EndPoints to a list of values that end at each index. 5460 for (auto &Interval : EndPoint) 5461 TransposeEnds[Interval.second].push_back(Interval.first); 5462 5463 SmallPtrSet<Instruction *, 8> OpenIntervals; 5464 5465 // Get the size of the widest register. 5466 unsigned MaxSafeDepDist = -1U; 5467 if (Legal->getMaxSafeDepDistBytes() != -1U) 5468 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5469 unsigned WidestRegister = 5470 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5471 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5472 5473 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5474 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5475 5476 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5477 5478 // A lambda that gets the register usage for the given type and VF. 5479 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5480 if (Ty->isTokenTy()) 5481 return 0U; 5482 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5483 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5484 }; 5485 5486 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5487 Instruction *I = IdxToInstr[i]; 5488 5489 // Remove all of the instructions that end at this location. 5490 InstrList &List = TransposeEnds[i]; 5491 for (Instruction *ToRemove : List) 5492 OpenIntervals.erase(ToRemove); 5493 5494 // Ignore instructions that are never used within the loop. 5495 if (Ends.find(I) == Ends.end()) 5496 continue; 5497 5498 // Skip ignored values. 5499 if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) 5500 continue; 5501 5502 // For each VF find the maximum usage of registers. 
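// A sketch with assumed numbers: with three i32 values live at this point, a
// 128-bit widest register and VFs[j] == 4, each vectorized open interval
// contributes
//
//   GetRegUsage(i32, /*VF=*/4) = std::max(1u, 4 * 32 / 128) = 1
//
// register of its class, for a total usage of 3; values that are scalar after
// vectorization are counted as a single scalar register instead.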
5503 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5504 // Count the number of live intervals. 5505 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5506 5507 if (VFs[j] == 1) { 5508 for (auto Inst : OpenIntervals) { 5509 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5510 if (RegUsage.find(ClassID) == RegUsage.end()) 5511 RegUsage[ClassID] = 1; 5512 else 5513 RegUsage[ClassID] += 1; 5514 } 5515 } else { 5516 collectUniformsAndScalars(VFs[j]); 5517 for (auto Inst : OpenIntervals) { 5518 // Skip ignored values for VF > 1. 5519 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) 5520 continue; 5521 if (isScalarAfterVectorization(Inst, VFs[j])) { 5522 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5523 if (RegUsage.find(ClassID) == RegUsage.end()) 5524 RegUsage[ClassID] = 1; 5525 else 5526 RegUsage[ClassID] += 1; 5527 } else { 5528 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5529 if (RegUsage.find(ClassID) == RegUsage.end()) 5530 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5531 else 5532 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5533 } 5534 } 5535 } 5536 5537 for (auto& pair : RegUsage) { 5538 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5539 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5540 else 5541 MaxUsages[j][pair.first] = pair.second; 5542 } 5543 } 5544 5545 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5546 << OpenIntervals.size() << '\n'); 5547 5548 // Add the current instruction to the list of open intervals. 5549 OpenIntervals.insert(I); 5550 } 5551 5552 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5553 SmallMapVector<unsigned, unsigned, 4> Invariant; 5554 5555 for (auto Inst : LoopInvariants) { 5556 unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5557 unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); 5558 if (Invariant.find(ClassID) == Invariant.end()) 5559 Invariant[ClassID] = Usage; 5560 else 5561 Invariant[ClassID] += Usage; 5562 } 5563 5564 LLVM_DEBUG({ 5565 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5566 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5567 << " item\n"; 5568 for (const auto &pair : MaxUsages[i]) { 5569 dbgs() << "LV(REG): RegisterClass: " 5570 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5571 << " registers\n"; 5572 } 5573 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5574 << " item\n"; 5575 for (const auto &pair : Invariant) { 5576 dbgs() << "LV(REG): RegisterClass: " 5577 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5578 << " registers\n"; 5579 } 5580 }); 5581 5582 RU.LoopInvariantRegs = Invariant; 5583 RU.MaxLocalUsers = MaxUsages[i]; 5584 RUs[i] = RU; 5585 } 5586 5587 return RUs; 5588 } 5589 5590 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5591 // TODO: Cost model for emulated masked load/store is completely 5592 // broken. This hack guides the cost model to use an artificially 5593 // high enough value to practically disable vectorization with such 5594 // operations, except where previously deployed legality hack allowed 5595 // using very low cost values. This is to avoid regressions coming simply 5596 // from moving "masked load/store" check from legality to cost model. 5597 // Masked Load/Gather emulation was previously never allowed. 5598 // Limited number of Masked Store/Scatter emulation was allowed. 
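  // Illustrative note (inferred from the surrounding code, not original text):
  // when this hook returns true, getMemInstScalarizationCost() pins the cost
  // of the emulated masked access at 3000000, which practically disables
  // vectorization with such operations, matching the TODO above.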
5599 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 5600 return isa<LoadInst>(I) || 5601 (isa<StoreInst>(I) && 5602 NumPredStores > NumberOfStoresToPredicate); 5603 } 5604 5605 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { 5606 // If we aren't vectorizing the loop, or if we've already collected the 5607 // instructions to scalarize, there's nothing to do. Collection may already 5608 // have occurred if we have a user-selected VF and are now computing the 5609 // expected cost for interleaving. 5610 if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end()) 5611 return; 5612 5613 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 5614 // not profitable to scalarize any instructions, the presence of VF in the 5615 // map will indicate that we've analyzed it already. 5616 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 5617 5618 // Find all the instructions that are scalar with predication in the loop and 5619 // determine if it would be better to not if-convert the blocks they are in. 5620 // If so, we also record the instructions to scalarize. 5621 for (BasicBlock *BB : TheLoop->blocks()) { 5622 if (!blockNeedsPredication(BB)) 5623 continue; 5624 for (Instruction &I : *BB) 5625 if (isScalarWithPredication(&I)) { 5626 ScalarCostsTy ScalarCosts; 5627 // Do not apply discount logic if hacked cost is needed 5628 // for emulated masked memrefs. 5629 if (!useEmulatedMaskMemRefHack(&I) && 5630 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 5631 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5632 // Remember that BB will remain after vectorization. 5633 PredicatedBBsAfterVectorization.insert(BB); 5634 } 5635 } 5636 } 5637 5638 int LoopVectorizationCostModel::computePredInstDiscount( 5639 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, 5640 unsigned VF) { 5641 assert(!isUniformAfterVectorization(PredInst, VF) && 5642 "Instruction marked uniform-after-vectorization will be predicated"); 5643 5644 // Initialize the discount to zero, meaning that the scalar version and the 5645 // vector version cost the same. 5646 int Discount = 0; 5647 5648 // Holds instructions to analyze. The instructions we visit are mapped in 5649 // ScalarCosts. Those instructions are the ones that would be scalarized if 5650 // we find that the scalar version costs less. 5651 SmallVector<Instruction *, 8> Worklist; 5652 5653 // Returns true if the given instruction can be scalarized. 5654 auto canBeScalarized = [&](Instruction *I) -> bool { 5655 // We only attempt to scalarize instructions forming a single-use chain 5656 // from the original predicated block that would otherwise be vectorized. 5657 // Although not strictly necessary, we give up on instructions we know will 5658 // already be scalar to avoid traversing chains that are unlikely to be 5659 // beneficial. 5660 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5661 isScalarAfterVectorization(I, VF)) 5662 return false; 5663 5664 // If the instruction is scalar with predication, it will be analyzed 5665 // separately. We ignore it within the context of PredInst. 5666 if (isScalarWithPredication(I)) 5667 return false; 5668 5669 // If any of the instruction's operands are uniform after vectorization, 5670 // the instruction cannot be scalarized. This prevents, for example, a 5671 // masked load from being scalarized. 
5672 // 5673 // We assume we will only emit a value for lane zero of an instruction 5674 // marked uniform after vectorization, rather than VF identical values. 5675 // Thus, if we scalarize an instruction that uses a uniform, we would 5676 // create uses of values corresponding to the lanes we aren't emitting code 5677 // for. This behavior can be changed by allowing getScalarValue to clone 5678 // the lane zero values for uniforms rather than asserting. 5679 for (Use &U : I->operands()) 5680 if (auto *J = dyn_cast<Instruction>(U.get())) 5681 if (isUniformAfterVectorization(J, VF)) 5682 return false; 5683 5684 // Otherwise, we can scalarize the instruction. 5685 return true; 5686 }; 5687 5688 // Compute the expected cost discount from scalarizing the entire expression 5689 // feeding the predicated instruction. We currently only consider expressions 5690 // that are single-use instruction chains. 5691 Worklist.push_back(PredInst); 5692 while (!Worklist.empty()) { 5693 Instruction *I = Worklist.pop_back_val(); 5694 5695 // If we've already analyzed the instruction, there's nothing to do. 5696 if (ScalarCosts.find(I) != ScalarCosts.end()) 5697 continue; 5698 5699 // Compute the cost of the vector instruction. Note that this cost already 5700 // includes the scalarization overhead of the predicated instruction. 5701 unsigned VectorCost = getInstructionCost(I, VF).first; 5702 5703 // Compute the cost of the scalarized instruction. This cost is the cost of 5704 // the instruction as if it wasn't if-converted and instead remained in the 5705 // predicated block. We will scale this cost by block probability after 5706 // computing the scalarization overhead. 5707 unsigned ScalarCost = VF * getInstructionCost(I, 1).first; 5708 5709 // Compute the scalarization overhead of needed insertelement instructions 5710 // and phi nodes. 5711 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5712 ScalarCost += TTI.getScalarizationOverhead( 5713 cast<VectorType>(ToVectorTy(I->getType(), VF)), 5714 APInt::getAllOnesValue(VF), true, false); 5715 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); 5716 } 5717 5718 // Compute the scalarization overhead of needed extractelement 5719 // instructions. For each of the instruction's operands, if the operand can 5720 // be scalarized, add it to the worklist; otherwise, account for the 5721 // overhead. 5722 for (Use &U : I->operands()) 5723 if (auto *J = dyn_cast<Instruction>(U.get())) { 5724 assert(VectorType::isValidElementType(J->getType()) && 5725 "Instruction has non-scalar type"); 5726 if (canBeScalarized(J)) 5727 Worklist.push_back(J); 5728 else if (needsExtract(J, VF)) 5729 ScalarCost += TTI.getScalarizationOverhead( 5730 cast<VectorType>(ToVectorTy(J->getType(), VF)), 5731 APInt::getAllOnesValue(VF), false, true); 5732 } 5733 5734 // Scale the total scalar cost by block probability. 5735 ScalarCost /= getReciprocalPredBlockProb(); 5736 5737 // Compute the discount. A non-negative discount means the vector version 5738 // of the instruction costs more, and scalarizing would be beneficial. 5739 Discount += VectorCost - ScalarCost; 5740 ScalarCosts[I] = ScalarCost; 5741 } 5742 5743 return Discount; 5744 } 5745 5746 LoopVectorizationCostModel::VectorizationCostTy 5747 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5748 VectorizationCostTy Cost; 5749 5750 // For each block. 5751 for (BasicBlock *BB : TheLoop->blocks()) { 5752 VectorizationCostTy BlockCost; 5753 5754 // For each instruction in the old loop. 
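    // Worked example (illustrative, not from the original source): if a block
    // contains three instructions whose costs for the chosen VF are 1, 2 and 4,
    // the block contributes 7 to the loop cost. For VF == 1, a predicated
    // block's total is additionally divided by getReciprocalPredBlockProb()
    // below, since the scalar block may not execute on every iteration.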
5755 for (Instruction &I : BB->instructionsWithoutDebug()) { 5756 // Skip ignored values. 5757 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() || 5758 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end())) 5759 continue; 5760 5761 VectorizationCostTy C = getInstructionCost(&I, VF); 5762 5763 // Check if we should override the cost. 5764 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5765 C.first = ForceTargetInstructionCost; 5766 5767 BlockCost.first += C.first; 5768 BlockCost.second |= C.second; 5769 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5770 << " for VF " << VF << " For instruction: " << I 5771 << '\n'); 5772 } 5773 5774 // If we are vectorizing a predicated block, it will have been 5775 // if-converted. This means that the block's instructions (aside from 5776 // stores and instructions that may divide by zero) will now be 5777 // unconditionally executed. For the scalar case, we may not always execute 5778 // the predicated block. Thus, scale the block's cost by the probability of 5779 // executing it. 5780 if (VF == 1 && blockNeedsPredication(BB)) 5781 BlockCost.first /= getReciprocalPredBlockProb(); 5782 5783 Cost.first += BlockCost.first; 5784 Cost.second |= BlockCost.second; 5785 } 5786 5787 return Cost; 5788 } 5789 5790 /// Gets Address Access SCEV after verifying that the access pattern 5791 /// is loop invariant except the induction variable dependence. 5792 /// 5793 /// This SCEV can be sent to the Target in order to estimate the address 5794 /// calculation cost. 5795 static const SCEV *getAddressAccessSCEV( 5796 Value *Ptr, 5797 LoopVectorizationLegality *Legal, 5798 PredicatedScalarEvolution &PSE, 5799 const Loop *TheLoop) { 5800 5801 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5802 if (!Gep) 5803 return nullptr; 5804 5805 // We are looking for a gep with all loop invariant indices except for one 5806 // which should be an induction variable. 5807 auto SE = PSE.getSE(); 5808 unsigned NumOperands = Gep->getNumOperands(); 5809 for (unsigned i = 1; i < NumOperands; ++i) { 5810 Value *Opd = Gep->getOperand(i); 5811 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5812 !Legal->isInductionVariable(Opd)) 5813 return nullptr; 5814 } 5815 5816 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 5817 return PSE.getSCEV(Ptr); 5818 } 5819 5820 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 5821 return Legal->hasStride(I->getOperand(0)) || 5822 Legal->hasStride(I->getOperand(1)); 5823 } 5824 5825 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5826 unsigned VF) { 5827 assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); 5828 Type *ValTy = getMemInstValueType(I); 5829 auto SE = PSE.getSE(); 5830 5831 unsigned AS = getLoadStoreAddressSpace(I); 5832 Value *Ptr = getLoadStorePointerOperand(I); 5833 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5834 5835 // Figure out whether the access is strided and get the stride value 5836 // if it's known in compile time 5837 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5838 5839 // Get the cost of the scalar memory instruction and address computation. 5840 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5841 5842 // Don't pass *I here, since it is scalar but will actually be part of a 5843 // vectorized loop where the user of it is a vectorized instruction. 
5844 const Align Alignment = getLoadStoreAlignment(I); 5845 Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 5846 Alignment, AS, 5847 TTI::TCK_RecipThroughput); 5848 5849 // Get the overhead of the extractelement and insertelement instructions 5850 // we might create due to scalarization. 5851 Cost += getScalarizationOverhead(I, VF); 5852 5853 // If we have a predicated store, it may not be executed for each vector 5854 // lane. Scale the cost by the probability of executing the predicated 5855 // block. 5856 if (isPredicatedInst(I)) { 5857 Cost /= getReciprocalPredBlockProb(); 5858 5859 if (useEmulatedMaskMemRefHack(I)) 5860 // Artificially setting to a high enough value to practically disable 5861 // vectorization with such operations. 5862 Cost = 3000000; 5863 } 5864 5865 return Cost; 5866 } 5867 5868 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5869 unsigned VF) { 5870 Type *ValTy = getMemInstValueType(I); 5871 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5872 Value *Ptr = getLoadStorePointerOperand(I); 5873 unsigned AS = getLoadStoreAddressSpace(I); 5874 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 5875 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5876 5877 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5878 "Stride should be 1 or -1 for consecutive memory access"); 5879 const Align Alignment = getLoadStoreAlignment(I); 5880 unsigned Cost = 0; 5881 if (Legal->isMaskRequired(I)) 5882 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, 5883 Alignment.value(), AS, CostKind); 5884 else 5885 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 5886 CostKind, I); 5887 5888 bool Reverse = ConsecutiveStride < 0; 5889 if (Reverse) 5890 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5891 return Cost; 5892 } 5893 5894 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5895 unsigned VF) { 5896 Type *ValTy = getMemInstValueType(I); 5897 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5898 const Align Alignment = getLoadStoreAlignment(I); 5899 unsigned AS = getLoadStoreAddressSpace(I); 5900 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5901 if (isa<LoadInst>(I)) { 5902 return TTI.getAddressComputationCost(ValTy) + 5903 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 5904 CostKind) + 5905 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 5906 } 5907 StoreInst *SI = cast<StoreInst>(I); 5908 5909 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 5910 return TTI.getAddressComputationCost(ValTy) + 5911 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 5912 CostKind) + 5913 (isLoopInvariantStoreValue 5914 ? 
0 5915 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 5916 VF - 1)); 5917 } 5918 5919 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 5920 unsigned VF) { 5921 Type *ValTy = getMemInstValueType(I); 5922 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5923 const Align Alignment = getLoadStoreAlignment(I); 5924 Value *Ptr = getLoadStorePointerOperand(I); 5925 5926 return TTI.getAddressComputationCost(VectorTy) + 5927 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, 5928 Legal->isMaskRequired(I), Alignment.value(), 5929 TargetTransformInfo::TCK_RecipThroughput, 5930 I); 5931 } 5932 5933 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 5934 unsigned VF) { 5935 Type *ValTy = getMemInstValueType(I); 5936 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5937 unsigned AS = getLoadStoreAddressSpace(I); 5938 5939 auto Group = getInterleavedAccessGroup(I); 5940 assert(Group && "Fail to get an interleaved access group."); 5941 5942 unsigned InterleaveFactor = Group->getFactor(); 5943 VectorType *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 5944 5945 // Holds the indices of existing members in an interleaved load group. 5946 // An interleaved store group doesn't need this as it doesn't allow gaps. 5947 SmallVector<unsigned, 4> Indices; 5948 if (isa<LoadInst>(I)) { 5949 for (unsigned i = 0; i < InterleaveFactor; i++) 5950 if (Group->getMember(i)) 5951 Indices.push_back(i); 5952 } 5953 5954 // Calculate the cost of the whole interleaved group. 5955 bool UseMaskForGaps = 5956 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5957 unsigned Cost = TTI.getInterleavedMemoryOpCost( 5958 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5959 Group->getAlign().value(), AS, TTI::TCK_RecipThroughput, 5960 Legal->isMaskRequired(I), UseMaskForGaps); 5961 5962 if (Group->isReverse()) { 5963 // TODO: Add support for reversed masked interleaved access. 5964 assert(!Legal->isMaskRequired(I) && 5965 "Reverse masked interleaved access not supported."); 5966 Cost += Group->getNumMembers() * 5967 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5968 } 5969 return Cost; 5970 } 5971 5972 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 5973 unsigned VF) { 5974 // Calculate scalar cost only. Vectorization cost should be ready at this 5975 // moment. 5976 if (VF == 1) { 5977 Type *ValTy = getMemInstValueType(I); 5978 const Align Alignment = getLoadStoreAlignment(I); 5979 unsigned AS = getLoadStoreAddressSpace(I); 5980 5981 return TTI.getAddressComputationCost(ValTy) + 5982 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 5983 TTI::TCK_RecipThroughput, I); 5984 } 5985 return getWideningCost(I, VF); 5986 } 5987 5988 LoopVectorizationCostModel::VectorizationCostTy 5989 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 5990 // If we know that this instruction will remain uniform, check the cost of 5991 // the scalar version. 5992 if (isUniformAfterVectorization(I, VF)) 5993 VF = 1; 5994 5995 if (VF > 1 && isProfitableToScalarize(I, VF)) 5996 return VectorizationCostTy(InstsToScalarize[VF][I], false); 5997 5998 // Forced scalars do not have any scalarization overhead. 
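  // For illustration (an assumption, not original text): an instruction forced
  // to remain scalar at VF = 4 whose scalar cost is 2 is charged 4 * 2 = 8
  // here, i.e. one scalar copy per lane and no extract/insert overhead.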
5999 auto ForcedScalar = ForcedScalars.find(VF); 6000 if (VF > 1 && ForcedScalar != ForcedScalars.end()) { 6001 auto InstSet = ForcedScalar->second; 6002 if (InstSet.find(I) != InstSet.end()) 6003 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); 6004 } 6005 6006 Type *VectorTy; 6007 unsigned C = getInstructionCost(I, VF, VectorTy); 6008 6009 bool TypeNotScalarized = 6010 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; 6011 return VectorizationCostTy(C, TypeNotScalarized); 6012 } 6013 6014 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6015 unsigned VF) { 6016 6017 if (VF == 1) 6018 return 0; 6019 6020 unsigned Cost = 0; 6021 Type *RetTy = ToVectorTy(I->getType(), VF); 6022 if (!RetTy->isVoidTy() && 6023 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6024 Cost += TTI.getScalarizationOverhead( 6025 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF), true, false); 6026 6027 // Some targets keep addresses scalar. 6028 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6029 return Cost; 6030 6031 // Some targets support efficient element stores. 6032 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6033 return Cost; 6034 6035 // Collect operands to consider. 6036 CallInst *CI = dyn_cast<CallInst>(I); 6037 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6038 6039 // Skip operands that do not require extraction/scalarization and do not incur 6040 // any overhead. 6041 return Cost + TTI.getOperandsScalarizationOverhead( 6042 filterExtractingOperands(Ops, VF), VF); 6043 } 6044 6045 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { 6046 if (VF == 1) 6047 return; 6048 NumPredStores = 0; 6049 for (BasicBlock *BB : TheLoop->blocks()) { 6050 // For each instruction in the old loop. 6051 for (Instruction &I : *BB) { 6052 Value *Ptr = getLoadStorePointerOperand(&I); 6053 if (!Ptr) 6054 continue; 6055 6056 // TODO: We should generate better code and update the cost model for 6057 // predicated uniform stores. Today they are treated as any other 6058 // predicated store (see added test cases in 6059 // invariant-store-vectorization.ll). 6060 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6061 NumPredStores++; 6062 6063 if (Legal->isUniform(Ptr) && 6064 // Conditional loads and stores should be scalarized and predicated. 6065 // isScalarWithPredication cannot be used here since masked 6066 // gather/scatters are not considered scalar with predication. 6067 !Legal->blockNeedsPredication(I.getParent())) { 6068 // TODO: Avoid replicating loads and stores instead of 6069 // relying on instcombine to remove them. 6070 // Load: Scalar load + broadcast 6071 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6072 unsigned Cost = getUniformMemOpCost(&I, VF); 6073 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6074 continue; 6075 } 6076 6077 // We assume that widening is the best solution when possible. 6078 if (memoryInstructionCanBeWidened(&I, VF)) { 6079 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6080 int ConsecutiveStride = 6081 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6082 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6083 "Expected consecutive stride."); 6084 InstWidening Decision = 6085 ConsecutiveStride == 1 ? 
CM_Widen : CM_Widen_Reverse; 6086 setWideningDecision(&I, VF, Decision, Cost); 6087 continue; 6088 } 6089 6090 // Choose between Interleaving, Gather/Scatter or Scalarization. 6091 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6092 unsigned NumAccesses = 1; 6093 if (isAccessInterleaved(&I)) { 6094 auto Group = getInterleavedAccessGroup(&I); 6095 assert(Group && "Fail to get an interleaved access group."); 6096 6097 // Make one decision for the whole group. 6098 if (getWideningDecision(&I, VF) != CM_Unknown) 6099 continue; 6100 6101 NumAccesses = Group->getNumMembers(); 6102 if (interleavedAccessCanBeWidened(&I, VF)) 6103 InterleaveCost = getInterleaveGroupCost(&I, VF); 6104 } 6105 6106 unsigned GatherScatterCost = 6107 isLegalGatherOrScatter(&I) 6108 ? getGatherScatterCost(&I, VF) * NumAccesses 6109 : std::numeric_limits<unsigned>::max(); 6110 6111 unsigned ScalarizationCost = 6112 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6113 6114 // Choose better solution for the current VF, 6115 // write down this decision and use it during vectorization. 6116 unsigned Cost; 6117 InstWidening Decision; 6118 if (InterleaveCost <= GatherScatterCost && 6119 InterleaveCost < ScalarizationCost) { 6120 Decision = CM_Interleave; 6121 Cost = InterleaveCost; 6122 } else if (GatherScatterCost < ScalarizationCost) { 6123 Decision = CM_GatherScatter; 6124 Cost = GatherScatterCost; 6125 } else { 6126 Decision = CM_Scalarize; 6127 Cost = ScalarizationCost; 6128 } 6129 // If the instructions belongs to an interleave group, the whole group 6130 // receives the same decision. The whole group receives the cost, but 6131 // the cost will actually be assigned to one instruction. 6132 if (auto Group = getInterleavedAccessGroup(&I)) 6133 setWideningDecision(Group, VF, Decision, Cost); 6134 else 6135 setWideningDecision(&I, VF, Decision, Cost); 6136 } 6137 } 6138 6139 // Make sure that any load of address and any other address computation 6140 // remains scalar unless there is gather/scatter support. This avoids 6141 // inevitable extracts into address registers, and also has the benefit of 6142 // activating LSR more, since that pass can't optimize vectorized 6143 // addresses. 6144 if (TTI.prefersVectorizedAddressing()) 6145 return; 6146 6147 // Start with all scalar pointer uses. 6148 SmallPtrSet<Instruction *, 8> AddrDefs; 6149 for (BasicBlock *BB : TheLoop->blocks()) 6150 for (Instruction &I : *BB) { 6151 Instruction *PtrDef = 6152 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6153 if (PtrDef && TheLoop->contains(PtrDef) && 6154 getWideningDecision(&I, VF) != CM_GatherScatter) 6155 AddrDefs.insert(PtrDef); 6156 } 6157 6158 // Add all instructions used to generate the addresses. 6159 SmallVector<Instruction *, 4> Worklist; 6160 for (auto *I : AddrDefs) 6161 Worklist.push_back(I); 6162 while (!Worklist.empty()) { 6163 Instruction *I = Worklist.pop_back_val(); 6164 for (auto &Op : I->operands()) 6165 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6166 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6167 AddrDefs.insert(InstOp).second) 6168 Worklist.push_back(InstOp); 6169 } 6170 6171 for (auto *I : AddrDefs) { 6172 if (isa<LoadInst>(I)) { 6173 // Setting the desired widening decision should ideally be handled in 6174 // by cost functions, but since this involves the task of finding out 6175 // if the loaded register is involved in an address computation, it is 6176 // instead changed here when we know this is the case. 
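    // Example (illustrative, not original text): a load whose result only
    // feeds the address of another memory access is switched below from
    // CM_Widen/CM_Widen_Reverse to CM_Scalarize, costed as VF times its scalar
    // memory cost, so the computed addresses stay in scalar registers.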
6177 InstWidening Decision = getWideningDecision(I, VF); 6178 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6179 // Scalarize a widened load of address. 6180 setWideningDecision(I, VF, CM_Scalarize, 6181 (VF * getMemoryInstructionCost(I, 1))); 6182 else if (auto Group = getInterleavedAccessGroup(I)) { 6183 // Scalarize an interleave group of address loads. 6184 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6185 if (Instruction *Member = Group->getMember(I)) 6186 setWideningDecision(Member, VF, CM_Scalarize, 6187 (VF * getMemoryInstructionCost(Member, 1))); 6188 } 6189 } 6190 } else 6191 // Make sure I gets scalarized and a cost estimate without 6192 // scalarization overhead. 6193 ForcedScalars[VF].insert(I); 6194 } 6195 } 6196 6197 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6198 unsigned VF, 6199 Type *&VectorTy) { 6200 Type *RetTy = I->getType(); 6201 if (canTruncateToMinimalBitwidth(I, VF)) 6202 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6203 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6204 auto SE = PSE.getSE(); 6205 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6206 6207 // TODO: We need to estimate the cost of intrinsic calls. 6208 switch (I->getOpcode()) { 6209 case Instruction::GetElementPtr: 6210 // We mark this instruction as zero-cost because the cost of GEPs in 6211 // vectorized code depends on whether the corresponding memory instruction 6212 // is scalarized or not. Therefore, we handle GEPs with the memory 6213 // instruction cost. 6214 return 0; 6215 case Instruction::Br: { 6216 // In cases of scalarized and predicated instructions, there will be VF 6217 // predicated blocks in the vectorized loop. Each branch around these 6218 // blocks requires also an extract of its vector compare i1 element. 6219 bool ScalarPredicatedBB = false; 6220 BranchInst *BI = cast<BranchInst>(I); 6221 if (VF > 1 && BI->isConditional() && 6222 (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) != 6223 PredicatedBBsAfterVectorization.end() || 6224 PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) != 6225 PredicatedBBsAfterVectorization.end())) 6226 ScalarPredicatedBB = true; 6227 6228 if (ScalarPredicatedBB) { 6229 // Return cost for branches around scalarized and predicated blocks. 6230 VectorType *Vec_i1Ty = 6231 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6232 return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF), 6233 false, true) + 6234 (TTI.getCFInstrCost(Instruction::Br) * VF)); 6235 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) 6236 // The back-edge branch will remain, as will all scalar branches. 6237 return TTI.getCFInstrCost(Instruction::Br); 6238 else 6239 // This branch will be eliminated by if-conversion. 6240 return 0; 6241 // Note: We currently assume zero cost for an unconditional branch inside 6242 // a predicated block since it will become a fall-through, although we 6243 // may decide in the future to call TTI for all branches. 6244 } 6245 case Instruction::PHI: { 6246 auto *Phi = cast<PHINode>(I); 6247 6248 // First-order recurrences are replaced by vector shuffles inside the loop. 6249 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 
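    // Illustrative IR (an assumption, not from the original source): for
    // VF = 4 the vector of previous values of a first-order recurrence is
    // produced by a splice-like shuffle of last iteration's vector with the
    // current one, e.g.
    //   %prev = shufflevector <4 x i32> %vector.recur, <4 x i32> %cur,
    //                         <4 x i32> <i32 3, i32 4, i32 5, i32 6>
    // The SK_ExtractSubvector query below approximates that shuffle's cost.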
6250 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) 6251 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 6252 cast<VectorType>(VectorTy), VF - 1, 6253 VectorType::get(RetTy, 1)); 6254 6255 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6256 // converted into select instructions. We require N - 1 selects per phi 6257 // node, where N is the number of incoming values. 6258 if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) 6259 return (Phi->getNumIncomingValues() - 1) * 6260 TTI.getCmpSelInstrCost( 6261 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6262 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6263 CostKind); 6264 6265 return TTI.getCFInstrCost(Instruction::PHI); 6266 } 6267 case Instruction::UDiv: 6268 case Instruction::SDiv: 6269 case Instruction::URem: 6270 case Instruction::SRem: 6271 // If we have a predicated instruction, it may not be executed for each 6272 // vector lane. Get the scalarization cost and scale this amount by the 6273 // probability of executing the predicated block. If the instruction is not 6274 // predicated, we fall through to the next case. 6275 if (VF > 1 && isScalarWithPredication(I)) { 6276 unsigned Cost = 0; 6277 6278 // These instructions have a non-void type, so account for the phi nodes 6279 // that we will create. This cost is likely to be zero. The phi node 6280 // cost, if any, should be scaled by the block probability because it 6281 // models a copy at the end of each predicated block. 6282 Cost += VF * TTI.getCFInstrCost(Instruction::PHI); 6283 6284 // The cost of the non-predicated instruction. 6285 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 6286 6287 // The cost of insertelement and extractelement instructions needed for 6288 // scalarization. 6289 Cost += getScalarizationOverhead(I, VF); 6290 6291 // Scale the cost by the probability of executing the predicated blocks. 6292 // This assumes the predicated block for each vector lane is equally 6293 // likely. 6294 return Cost / getReciprocalPredBlockProb(); 6295 } 6296 LLVM_FALLTHROUGH; 6297 case Instruction::Add: 6298 case Instruction::FAdd: 6299 case Instruction::Sub: 6300 case Instruction::FSub: 6301 case Instruction::Mul: 6302 case Instruction::FMul: 6303 case Instruction::FDiv: 6304 case Instruction::FRem: 6305 case Instruction::Shl: 6306 case Instruction::LShr: 6307 case Instruction::AShr: 6308 case Instruction::And: 6309 case Instruction::Or: 6310 case Instruction::Xor: { 6311 // Since we will replace the stride by 1 the multiplication should go away. 6312 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6313 return 0; 6314 // Certain instructions can be cheaper to vectorize if they have a constant 6315 // second vector operand. One example of this are shifts on x86. 6316 Value *Op2 = I->getOperand(1); 6317 TargetTransformInfo::OperandValueProperties Op2VP; 6318 TargetTransformInfo::OperandValueKind Op2VK = 6319 TTI.getOperandInfo(Op2, Op2VP); 6320 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6321 Op2VK = TargetTransformInfo::OK_UniformValue; 6322 6323 SmallVector<const Value *, 4> Operands(I->operand_values()); 6324 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6325 return N * TTI.getArithmeticInstrCost( 6326 I->getOpcode(), VectorTy, CostKind, 6327 TargetTransformInfo::OK_AnyValue, 6328 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6329 } 6330 case Instruction::FNeg: { 6331 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6332 return N * TTI.getArithmeticInstrCost( 6333 I->getOpcode(), VectorTy, CostKind, 6334 TargetTransformInfo::OK_AnyValue, 6335 TargetTransformInfo::OK_AnyValue, 6336 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6337 I->getOperand(0), I); 6338 } 6339 case Instruction::Select: { 6340 SelectInst *SI = cast<SelectInst>(I); 6341 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6342 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6343 Type *CondTy = SI->getCondition()->getType(); 6344 if (!ScalarCond) 6345 CondTy = VectorType::get(CondTy, VF); 6346 6347 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 6348 CostKind, I); 6349 } 6350 case Instruction::ICmp: 6351 case Instruction::FCmp: { 6352 Type *ValTy = I->getOperand(0)->getType(); 6353 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6354 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6355 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6356 VectorTy = ToVectorTy(ValTy, VF); 6357 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind, 6358 I); 6359 } 6360 case Instruction::Store: 6361 case Instruction::Load: { 6362 unsigned Width = VF; 6363 if (Width > 1) { 6364 InstWidening Decision = getWideningDecision(I, Width); 6365 assert(Decision != CM_Unknown && 6366 "CM decision should be taken at this point"); 6367 if (Decision == CM_Scalarize) 6368 Width = 1; 6369 } 6370 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6371 return getMemoryInstructionCost(I, VF); 6372 } 6373 case Instruction::ZExt: 6374 case Instruction::SExt: 6375 case Instruction::FPToUI: 6376 case Instruction::FPToSI: 6377 case Instruction::FPExt: 6378 case Instruction::PtrToInt: 6379 case Instruction::IntToPtr: 6380 case Instruction::SIToFP: 6381 case Instruction::UIToFP: 6382 case Instruction::Trunc: 6383 case Instruction::FPTrunc: 6384 case Instruction::BitCast: { 6385 // We optimize the truncation of induction variables having constant 6386 // integer steps. The cost of these truncations is the same as the scalar 6387 // operation. 6388 if (isOptimizableIVTruncate(I, VF)) { 6389 auto *Trunc = cast<TruncInst>(I); 6390 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6391 Trunc->getSrcTy(), CostKind, Trunc); 6392 } 6393 6394 Type *SrcScalarTy = I->getOperand(0)->getType(); 6395 Type *SrcVecTy = 6396 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6397 if (canTruncateToMinimalBitwidth(I, VF)) { 6398 // This cast is going to be shrunk. This may remove the cast or it might 6399 // turn it into slightly different cast. For example, if MinBW == 16, 6400 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6401 // 6402 // Calculate the modified src and dest types. 6403 Type *MinVecTy = VectorTy; 6404 if (I->getOpcode() == Instruction::Trunc) { 6405 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6406 VectorTy = 6407 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6408 } else if (I->getOpcode() == Instruction::ZExt || 6409 I->getOpcode() == Instruction::SExt) { 6410 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6411 VectorTy = 6412 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6413 } 6414 } 6415 6416 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6417 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, 6418 CostKind, I); 6419 } 6420 case Instruction::Call: { 6421 bool NeedToScalarize; 6422 CallInst *CI = cast<CallInst>(I); 6423 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6424 if (getVectorIntrinsicIDForCall(CI, TLI)) 6425 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6426 return CallCost; 6427 } 6428 default: 6429 // The cost of executing VF copies of the scalar instruction. This opcode 6430 // is unknown. Assume that it is the same as 'mul'. 6431 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, 6432 CostKind) + 6433 getScalarizationOverhead(I, VF); 6434 } // end of switch. 6435 } 6436 6437 char LoopVectorize::ID = 0; 6438 6439 static const char lv_name[] = "Loop Vectorization"; 6440 6441 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6442 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6443 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6444 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6445 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6446 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6447 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6448 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6449 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6450 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6451 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6452 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6453 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6454 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6455 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6456 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6457 6458 namespace llvm { 6459 6460 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6461 6462 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6463 bool VectorizeOnlyWhenForced) { 6464 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6465 } 6466 6467 } // end namespace llvm 6468 6469 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6470 // Check if the pointer operand of a load or store instruction is 6471 // consecutive. 6472 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6473 return Legal->isConsecutivePtr(Ptr); 6474 return false; 6475 } 6476 6477 void LoopVectorizationCostModel::collectValuesToIgnore() { 6478 // Ignore ephemeral values. 6479 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6480 6481 // Ignore type-promoting instructions we identified during reduction 6482 // detection. 6483 for (auto &Reduction : Legal->getReductionVars()) { 6484 RecurrenceDescriptor &RedDes = Reduction.second; 6485 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6486 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6487 } 6488 // Ignore type-casting instructions we identified during induction 6489 // detection. 6490 for (auto &Induction : Legal->getInductionVars()) { 6491 InductionDescriptor &IndDes = Induction.second; 6492 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6493 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6494 } 6495 } 6496 6497 // TODO: we could return a pair of values that specify the max VF and 6498 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6499 // `buildVPlans(VF, VF)`. 
We cannot do it because VPLAN at the moment 6500 // doesn't have a cost model that can choose which plan to execute if 6501 // more than one is generated. 6502 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 6503 LoopVectorizationCostModel &CM) { 6504 unsigned WidestType; 6505 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 6506 return WidestVectorRegBits / WidestType; 6507 } 6508 6509 VectorizationFactor 6510 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { 6511 unsigned VF = UserVF; 6512 // Outer loop handling: They may require CFG and instruction level 6513 // transformations before even evaluating whether vectorization is profitable. 6514 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 6515 // the vectorization pipeline. 6516 if (!OrigLoop->empty()) { 6517 // If the user doesn't provide a vectorization factor, determine a 6518 // reasonable one. 6519 if (!UserVF) { 6520 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM); 6521 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 6522 6523 // Make sure we have a VF > 1 for stress testing. 6524 if (VPlanBuildStressTest && VF < 2) { 6525 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 6526 << "overriding computed VF.\n"); 6527 VF = 4; 6528 } 6529 } 6530 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 6531 assert(isPowerOf2_32(VF) && "VF needs to be a power of two"); 6532 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF 6533 << " to build VPlans.\n"); 6534 buildVPlans(VF, VF); 6535 6536 // For VPlan build stress testing, we bail out after VPlan construction. 6537 if (VPlanBuildStressTest) 6538 return VectorizationFactor::Disabled(); 6539 6540 return {VF, 0}; 6541 } 6542 6543 LLVM_DEBUG( 6544 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 6545 "VPlan-native path.\n"); 6546 return VectorizationFactor::Disabled(); 6547 } 6548 6549 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF, 6550 unsigned UserIC) { 6551 assert(OrigLoop->empty() && "Inner loop expected."); 6552 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); 6553 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 6554 return None; 6555 6556 // Invalidate interleave groups if all blocks of loop will be predicated. 6557 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 6558 !useMaskedInterleavedAccesses(*TTI)) { 6559 LLVM_DEBUG( 6560 dbgs() 6561 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 6562 "which requires masked-interleaved support.\n"); 6563 if (CM.InterleaveInfo.invalidateGroups()) 6564 // Invalidating interleave groups also requires invalidating all decisions 6565 // based on them, which includes widening decisions and uniform and scalar 6566 // values. 6567 CM.invalidateCostModelingDecisions(); 6568 } 6569 6570 if (UserVF) { 6571 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 6572 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); 6573 // Collect the instructions (and their associated costs) that will be more 6574 // profitable to scalarize. 
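  // Example (illustrative, not original text): with a user-requested VF of 8,
  // the calls below build a single VPlan for that VF only and return
  // VectorizationFactor {8, 0}, i.e. the user's width with the cost field
  // left at zero.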
6575 CM.selectUserVectorizationFactor(UserVF); 6576 buildVPlansWithVPRecipes(UserVF, UserVF); 6577 LLVM_DEBUG(printPlans(dbgs())); 6578 return {{UserVF, 0}}; 6579 } 6580 6581 unsigned MaxVF = MaybeMaxVF.getValue(); 6582 assert(MaxVF != 0 && "MaxVF is zero."); 6583 6584 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 6585 // Collect Uniform and Scalar instructions after vectorization with VF. 6586 CM.collectUniformsAndScalars(VF); 6587 6588 // Collect the instructions (and their associated costs) that will be more 6589 // profitable to scalarize. 6590 if (VF > 1) 6591 CM.collectInstsToScalarize(VF); 6592 } 6593 6594 buildVPlansWithVPRecipes(1, MaxVF); 6595 LLVM_DEBUG(printPlans(dbgs())); 6596 if (MaxVF == 1) 6597 return VectorizationFactor::Disabled(); 6598 6599 // Select the optimal vectorization factor. 6600 return CM.selectVectorizationFactor(MaxVF); 6601 } 6602 6603 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { 6604 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 6605 << '\n'); 6606 BestVF = VF; 6607 BestUF = UF; 6608 6609 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 6610 return !Plan->hasVF(VF); 6611 }); 6612 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 6613 } 6614 6615 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 6616 DominatorTree *DT) { 6617 // Perform the actual loop transformation. 6618 6619 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 6620 VPCallbackILV CallbackILV(ILV); 6621 6622 VPTransformState State{BestVF, BestUF, LI, 6623 DT, ILV.Builder, ILV.VectorLoopValueMap, 6624 &ILV, CallbackILV}; 6625 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 6626 State.TripCount = ILV.getOrCreateTripCount(nullptr); 6627 State.CanonicalIV = ILV.Induction; 6628 6629 //===------------------------------------------------===// 6630 // 6631 // Notice: any optimization or new instruction that go 6632 // into the code below should also be implemented in 6633 // the cost-model. 6634 // 6635 //===------------------------------------------------===// 6636 6637 // 2. Copy and widen instructions from the old loop into the new loop. 6638 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 6639 VPlans.front()->execute(&State); 6640 6641 // 3. Fix the vectorized code: take care of header phi's, live-outs, 6642 // predication, updating analyses. 6643 ILV.fixVectorizedLoop(); 6644 } 6645 6646 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 6647 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6648 BasicBlock *Latch = OrigLoop->getLoopLatch(); 6649 6650 // We create new control-flow for the vectorized loop, so the original 6651 // condition will be dead after vectorization if it's only used by the 6652 // branch. 6653 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 6654 if (Cmp && Cmp->hasOneUse()) 6655 DeadInstructions.insert(Cmp); 6656 6657 // We create new "steps" for induction variable updates to which the original 6658 // induction variables map. An original update instruction will be dead if 6659 // all its users except the induction variable are dead. 
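  // Example (illustrative, not original text): for a canonical induction with
  //   %iv.next = add nuw nsw i64 %iv, 1
  // whose only users are the induction phi and the latch compare handled
  // above, the update is recorded as dead, because the vector loop creates
  // fresh induction steps of its own.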
6660 for (auto &Induction : Legal->getInductionVars()) { 6661 PHINode *Ind = Induction.first; 6662 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 6663 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 6664 return U == Ind || DeadInstructions.find(cast<Instruction>(U)) != 6665 DeadInstructions.end(); 6666 })) 6667 DeadInstructions.insert(IndUpdate); 6668 6669 // We record as "Dead" also the type-casting instructions we had identified 6670 // during induction analysis. We don't need any handling for them in the 6671 // vectorized loop because we have proven that, under a proper runtime 6672 // test guarding the vectorized loop, the value of the phi, and the casted 6673 // value of the phi, are the same. The last instruction in this casting chain 6674 // will get its scalar/vector/widened def from the scalar/vector/widened def 6675 // of the respective phi node. Any other casts in the induction def-use chain 6676 // have no other uses outside the phi update chain, and will be ignored. 6677 InductionDescriptor &IndDes = Induction.second; 6678 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6679 DeadInstructions.insert(Casts.begin(), Casts.end()); 6680 } 6681 } 6682 6683 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6684 6685 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6686 6687 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6688 Instruction::BinaryOps BinOp) { 6689 // When unrolling and the VF is 1, we only need to add a simple scalar. 6690 Type *Ty = Val->getType(); 6691 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6692 6693 if (Ty->isFloatingPointTy()) { 6694 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6695 6696 // Floating point operations had to be 'fast' to enable the unrolling. 6697 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6698 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6699 } 6700 Constant *C = ConstantInt::get(Ty, StartIdx); 6701 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6702 } 6703 6704 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6705 SmallVector<Metadata *, 4> MDs; 6706 // Reserve first location for self reference to the LoopID metadata node. 6707 MDs.push_back(nullptr); 6708 bool IsUnrollMetadata = false; 6709 MDNode *LoopID = L->getLoopID(); 6710 if (LoopID) { 6711 // First find existing loop unrolling disable metadata. 6712 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6713 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6714 if (MD) { 6715 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6716 IsUnrollMetadata = 6717 S && S->getString().startswith("llvm.loop.unroll.disable"); 6718 } 6719 MDs.push_back(LoopID->getOperand(i)); 6720 } 6721 } 6722 6723 if (!IsUnrollMetadata) { 6724 // Add runtime unroll disable metadata. 6725 LLVMContext &Context = L->getHeader()->getContext(); 6726 SmallVector<Metadata *, 1> DisableOperands; 6727 DisableOperands.push_back( 6728 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6729 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6730 MDs.push_back(DisableNode); 6731 MDNode *NewLoopID = MDNode::get(Context, MDs); 6732 // Set operand 0 to refer to the loop id itself. 
6733 NewLoopID->replaceOperandWith(0, NewLoopID); 6734 L->setLoopID(NewLoopID); 6735 } 6736 } 6737 6738 bool LoopVectorizationPlanner::getDecisionAndClampRange( 6739 const std::function<bool(unsigned)> &Predicate, VFRange &Range) { 6740 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 6741 bool PredicateAtRangeStart = Predicate(Range.Start); 6742 6743 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 6744 if (Predicate(TmpVF) != PredicateAtRangeStart) { 6745 Range.End = TmpVF; 6746 break; 6747 } 6748 6749 return PredicateAtRangeStart; 6750 } 6751 6752 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 6753 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 6754 /// of VF's starting at a given VF and extending it as much as possible. Each 6755 /// vectorization decision can potentially shorten this sub-range during 6756 /// buildVPlan(). 6757 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 6758 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6759 VFRange SubRange = {VF, MaxVF + 1}; 6760 VPlans.push_back(buildVPlan(SubRange)); 6761 VF = SubRange.End; 6762 } 6763 } 6764 6765 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 6766 VPlanPtr &Plan) { 6767 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 6768 6769 // Look for cached value. 6770 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 6771 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 6772 if (ECEntryIt != EdgeMaskCache.end()) 6773 return ECEntryIt->second; 6774 6775 VPValue *SrcMask = createBlockInMask(Src, Plan); 6776 6777 // The terminator has to be a branch inst! 6778 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 6779 assert(BI && "Unexpected terminator found"); 6780 6781 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 6782 return EdgeMaskCache[Edge] = SrcMask; 6783 6784 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 6785 assert(EdgeMask && "No Edge Mask found for condition"); 6786 6787 if (BI->getSuccessor(0) != Dst) 6788 EdgeMask = Builder.createNot(EdgeMask); 6789 6790 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 6791 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 6792 6793 return EdgeMaskCache[Edge] = EdgeMask; 6794 } 6795 6796 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 6797 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 6798 6799 // Look for cached value. 6800 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 6801 if (BCEntryIt != BlockMaskCache.end()) 6802 return BCEntryIt->second; 6803 6804 // All-one mask is modelled as no-mask following the convention for masked 6805 // load/store/gather/scatter. Initialize BlockMask to no-mask. 6806 VPValue *BlockMask = nullptr; 6807 6808 if (OrigLoop->getHeader() == BB) { 6809 if (!CM.blockNeedsPredication(BB)) 6810 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 6811 6812 // Introduce the early-exit compare IV <= BTC to form header block mask. 6813 // This is used instead of IV < TC because TC may wrap, unlike BTC. 6814 // Start by constructing the desired canonical IV. 
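    // Illustrative example (not from the original source): with a trip count
    // of 7 and VF = 4, BTC = 6; in the second vector iteration the lanes hold
    // IV values {4, 5, 6, 7}, so the ICmpULE mask below evaluates to
    // {1, 1, 1, 0} and the last lane is masked off when folding the tail.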
6815 VPValue *IV = nullptr; 6816 if (Legal->getPrimaryInduction()) 6817 IV = Plan->getVPValue(Legal->getPrimaryInduction()); 6818 else { 6819 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 6820 Builder.getInsertBlock()->appendRecipe(IVRecipe); 6821 IV = IVRecipe->getVPValue(); 6822 } 6823 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 6824 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 6825 return BlockMaskCache[BB] = BlockMask; 6826 } 6827 6828 // This is the block mask. We OR all incoming edges. 6829 for (auto *Predecessor : predecessors(BB)) { 6830 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 6831 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 6832 return BlockMaskCache[BB] = EdgeMask; 6833 6834 if (!BlockMask) { // BlockMask has its initialized nullptr value. 6835 BlockMask = EdgeMask; 6836 continue; 6837 } 6838 6839 BlockMask = Builder.createOr(BlockMask, EdgeMask); 6840 } 6841 6842 return BlockMaskCache[BB] = BlockMask; 6843 } 6844 6845 VPWidenMemoryInstructionRecipe * 6846 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 6847 VPlanPtr &Plan) { 6848 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 6849 "Must be called with either a load or store"); 6850 6851 auto willWiden = [&](unsigned VF) -> bool { 6852 if (VF == 1) 6853 return false; 6854 LoopVectorizationCostModel::InstWidening Decision = 6855 CM.getWideningDecision(I, VF); 6856 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 6857 "CM decision should be taken at this point."); 6858 if (Decision == LoopVectorizationCostModel::CM_Interleave) 6859 return true; 6860 if (CM.isScalarAfterVectorization(I, VF) || 6861 CM.isProfitableToScalarize(I, VF)) 6862 return false; 6863 return Decision != LoopVectorizationCostModel::CM_Scalarize; 6864 }; 6865 6866 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6867 return nullptr; 6868 6869 VPValue *Mask = nullptr; 6870 if (Legal->isMaskRequired(I)) 6871 Mask = createBlockInMask(I->getParent(), Plan); 6872 6873 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 6874 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 6875 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 6876 6877 StoreInst *Store = cast<StoreInst>(I); 6878 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 6879 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 6880 } 6881 6882 VPWidenIntOrFpInductionRecipe * 6883 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { 6884 // Check if this is an integer or fp induction. If so, build the recipe that 6885 // produces its scalar and vector values. 6886 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 6887 if (II.getKind() == InductionDescriptor::IK_IntInduction || 6888 II.getKind() == InductionDescriptor::IK_FpInduction) 6889 return new VPWidenIntOrFpInductionRecipe(Phi); 6890 6891 return nullptr; 6892 } 6893 6894 VPWidenIntOrFpInductionRecipe * 6895 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, 6896 VFRange &Range) const { 6897 // Optimize the special case where the source is a constant integer 6898 // induction variable. Notice that we can only optimize the 'trunc' case 6899 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 6900 // (c) other casts depend on pointer size. 6901 6902 // Determine whether \p K is a truncation based on an induction variable that 6903 // can be optimized. 
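  // For illustration (an assumption, not original text): for
  //   %t = trunc i64 %iv to i32
  // where %iv is an integer induction with a constant step, the truncate is
  // folded into the widened induction itself (an i32 induction is created
  // directly) instead of widening the i64 induction and truncating every
  // element.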
6904 auto isOptimizableIVTruncate = 6905 [&](Instruction *K) -> std::function<bool(unsigned)> { 6906 return 6907 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; 6908 }; 6909 6910 if (LoopVectorizationPlanner::getDecisionAndClampRange( 6911 isOptimizableIVTruncate(I), Range)) 6912 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 6913 I); 6914 return nullptr; 6915 } 6916 6917 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 6918 // We know that all PHIs in non-header blocks are converted into selects, so 6919 // we don't have to worry about the insertion order and we can just use the 6920 // builder. At this point we generate the predication tree. There may be 6921 // duplications since this is a simple recursive scan, but future 6922 // optimizations will clean it up. 6923 6924 SmallVector<VPValue *, 2> Operands; 6925 unsigned NumIncoming = Phi->getNumIncomingValues(); 6926 for (unsigned In = 0; In < NumIncoming; In++) { 6927 VPValue *EdgeMask = 6928 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 6929 assert((EdgeMask || NumIncoming == 1) && 6930 "Multiple predecessors with one having a full mask"); 6931 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 6932 if (EdgeMask) 6933 Operands.push_back(EdgeMask); 6934 } 6935 return new VPBlendRecipe(Phi, Operands); 6936 } 6937 6938 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 6939 VPlan &Plan) const { 6940 6941 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6942 [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); }, 6943 Range); 6944 6945 if (IsPredicated) 6946 return nullptr; 6947 6948 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6949 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 6950 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 6951 return nullptr; 6952 6953 auto willWiden = [&](unsigned VF) -> bool { 6954 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6955 // The following case may be scalarized depending on the VF. 6956 // The flag shows whether we use Intrinsic or a usual Call for vectorized 6957 // version of the instruction. 6958 // Is it beneficial to perform intrinsic call compared to lib call? 6959 bool NeedToScalarize = false; 6960 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 6961 bool UseVectorIntrinsic = 6962 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 6963 return UseVectorIntrinsic || !NeedToScalarize; 6964 }; 6965 6966 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6967 return nullptr; 6968 6969 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 6970 } 6971 6972 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 6973 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 6974 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 6975 // Instruction should be widened, unless it is scalar after vectorization, 6976 // scalarization is profitable or it is predicated. 
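// A worked example of the range clamping used here (hedged, with assumed
// numbers): given Range = {Start=4, End=17} and a WillScalarize predicate
// that is false for VF=4 and VF=8 but true for VF=16,
// getDecisionAndClampRange returns the predicate's value at Range.Start
// (false) and clamps Range.End to 16, the first VF where the answer flips.
// shouldWiden therefore returns true, and the widened recipe built from this
// decision covers VF=4 and VF=8 only; VF=16 is handled by the next sub-range
// in buildVPlans/buildVPlansWithVPRecipes.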
6977 auto WillScalarize = [this, I](unsigned VF) -> bool { 6978 return CM.isScalarAfterVectorization(I, VF) || 6979 CM.isProfitableToScalarize(I, VF) || 6980 CM.isScalarWithPredication(I, VF); 6981 }; 6982 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 6983 Range); 6984 } 6985 6986 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 6987 auto IsVectorizableOpcode = [](unsigned Opcode) { 6988 switch (Opcode) { 6989 case Instruction::Add: 6990 case Instruction::And: 6991 case Instruction::AShr: 6992 case Instruction::BitCast: 6993 case Instruction::FAdd: 6994 case Instruction::FCmp: 6995 case Instruction::FDiv: 6996 case Instruction::FMul: 6997 case Instruction::FNeg: 6998 case Instruction::FPExt: 6999 case Instruction::FPToSI: 7000 case Instruction::FPToUI: 7001 case Instruction::FPTrunc: 7002 case Instruction::FRem: 7003 case Instruction::FSub: 7004 case Instruction::ICmp: 7005 case Instruction::IntToPtr: 7006 case Instruction::LShr: 7007 case Instruction::Mul: 7008 case Instruction::Or: 7009 case Instruction::PtrToInt: 7010 case Instruction::SDiv: 7011 case Instruction::Select: 7012 case Instruction::SExt: 7013 case Instruction::Shl: 7014 case Instruction::SIToFP: 7015 case Instruction::SRem: 7016 case Instruction::Sub: 7017 case Instruction::Trunc: 7018 case Instruction::UDiv: 7019 case Instruction::UIToFP: 7020 case Instruction::URem: 7021 case Instruction::Xor: 7022 case Instruction::ZExt: 7023 return true; 7024 } 7025 return false; 7026 }; 7027 7028 if (!IsVectorizableOpcode(I->getOpcode())) 7029 return nullptr; 7030 7031 // Success: widen this instruction. 7032 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 7033 } 7034 7035 VPBasicBlock *VPRecipeBuilder::handleReplication( 7036 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 7037 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 7038 VPlanPtr &Plan) { 7039 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 7040 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, 7041 Range); 7042 7043 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7044 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 7045 7046 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 7047 IsUniform, IsPredicated); 7048 setRecipe(I, Recipe); 7049 7050 // Find if I uses a predicated instruction. If so, it will use its scalar 7051 // value. Avoid hoisting the insert-element which packs the scalar value into 7052 // a vector value, as that happens iff all users use the vector value. 7053 for (auto &Op : I->operands()) 7054 if (auto *PredInst = dyn_cast<Instruction>(Op)) 7055 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 7056 PredInst2Recipe[PredInst]->setAlsoPack(false); 7057 7058 // Finalize the recipe for Instr, first if it is not predicated. 7059 if (!IsPredicated) { 7060 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 7061 VPBB->appendRecipe(Recipe); 7062 return VPBB; 7063 } 7064 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7065 assert(VPBB->getSuccessors().empty() && 7066 "VPBB has successors when handling predicated replication."); 7067 // Record predicated instructions for above packing optimizations. 
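// The replicate region created just below has, as a rough sketch (the
// pred.<opcode>.* names come from createReplicateRegion; the rest is
// illustrative):
//   VPBB -> [ pred.<op>.entry (BranchOnMask)
//               -> pred.<op>.if (replicated scalar I)
//               -> pred.<op>.continue (PredInstPHI) ] -> RegSucc
// and subsequent recipes for this basic block are emitted into RegSucc.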
7068 PredInst2Recipe[I] = Recipe; 7069 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7070 VPBlockUtils::insertBlockAfter(Region, VPBB); 7071 auto *RegSucc = new VPBasicBlock(); 7072 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7073 return RegSucc; 7074 } 7075 7076 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7077 VPRecipeBase *PredRecipe, 7078 VPlanPtr &Plan) { 7079 // Instructions marked for predication are replicated and placed under an 7080 // if-then construct to prevent side-effects. 7081 7082 // Generate recipes to compute the block mask for this region. 7083 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 7084 7085 // Build the triangular if-then region. 7086 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 7087 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 7088 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 7089 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 7090 auto *PHIRecipe = 7091 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 7092 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 7093 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 7094 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 7095 7096 // Note: first set Entry as region entry and then connect successors starting 7097 // from it in order, to propagate the "parent" of each VPBasicBlock. 7098 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 7099 VPBlockUtils::connectBlocks(Pred, Exit); 7100 7101 return Region; 7102 } 7103 7104 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 7105 VFRange &Range, 7106 VPlanPtr &Plan) { 7107 // First, check for specific widening recipes that deal with calls, memory 7108 // operations, inductions and Phi nodes. 7109 if (auto *CI = dyn_cast<CallInst>(Instr)) 7110 return tryToWidenCall(CI, Range, *Plan); 7111 7112 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 7113 return tryToWidenMemory(Instr, Range, Plan); 7114 7115 VPRecipeBase *Recipe; 7116 if (auto Phi = dyn_cast<PHINode>(Instr)) { 7117 if (Phi->getParent() != OrigLoop->getHeader()) 7118 return tryToBlend(Phi, Plan); 7119 if ((Recipe = tryToOptimizeInductionPHI(Phi))) 7120 return Recipe; 7121 return new VPWidenPHIRecipe(Phi); 7122 } 7123 7124 if (isa<TruncInst>(Instr) && 7125 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range))) 7126 return Recipe; 7127 7128 if (!shouldWiden(Instr, Range)) 7129 return nullptr; 7130 7131 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 7132 return new VPWidenGEPRecipe(GEP, OrigLoop); 7133 7134 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 7135 bool InvariantCond = 7136 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 7137 return new VPWidenSelectRecipe(*SI, InvariantCond); 7138 } 7139 7140 return tryToWiden(Instr, *Plan); 7141 } 7142 7143 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 7144 unsigned MaxVF) { 7145 assert(OrigLoop->empty() && "Inner loop expected."); 7146 7147 // Collect conditions feeding internal conditional branches; they need to be 7148 // represented in VPlan for it to model masking. 
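// For instance (hedged example, names hypothetical): for a body containing
//   if (cond[i] != 0) a[i] = b[i];
// the icmp feeding the inner conditional branch is recorded here so that
// Plan->addVPValue() gives it a VPValue, which createEdgeMask() later looks
// up via Plan->getVPValue(BI->getCondition()) when forming edge and block
// masks.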
7149 SmallPtrSet<Value *, 1> NeedDef;
7150
7151 auto *Latch = OrigLoop->getLoopLatch();
7152 for (BasicBlock *BB : OrigLoop->blocks()) {
7153 if (BB == Latch)
7154 continue;
7155 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7156 if (Branch && Branch->isConditional())
7157 NeedDef.insert(Branch->getCondition());
7158 }
7159
7160 // If the tail is to be folded by masking, the primary induction variable, if
7161 // it exists, needs to be represented in VPlan for it to model early-exit masking.
7162 // Also, both the Phi and the live-out instruction of each reduction are
7163 // required in order to introduce a select between them in VPlan.
7164 if (CM.foldTailByMasking()) {
7165 if (Legal->getPrimaryInduction())
7166 NeedDef.insert(Legal->getPrimaryInduction());
7167 for (auto &Reduction : Legal->getReductionVars()) {
7168 NeedDef.insert(Reduction.first);
7169 NeedDef.insert(Reduction.second.getLoopExitInstr());
7170 }
7171 }
7172
7173 // Collect instructions from the original loop that will become trivially dead
7174 // in the vectorized loop. We don't need to vectorize these instructions. For
7175 // example, original induction update instructions can become dead because we
7176 // separately emit induction "steps" when generating code for the new loop.
7177 // Similarly, we create a new latch condition when setting up the structure
7178 // of the new loop, so the old one can become dead.
7179 SmallPtrSet<Instruction *, 4> DeadInstructions;
7180 collectTriviallyDeadInstructions(DeadInstructions);
7181
7182 // Add assume instructions we need to drop to DeadInstructions, to prevent
7183 // them from being added to the VPlan.
7184 // TODO: We only need to drop assumes in blocks that get flattened. If the
7185 // control flow is preserved, we should keep them.
7186 auto &ConditionalAssumes = Legal->getConditionalAssumes();
7187 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7188
7189 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7190 // Dead instructions do not need sinking. Remove them from SinkAfter.
7191 for (Instruction *I : DeadInstructions)
7192 SinkAfter.erase(I);
7193
7194 for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7195 VFRange SubRange = {VF, MaxVF + 1};
7196 VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7197 DeadInstructions, SinkAfter));
7198 VF = SubRange.End;
7199 }
7200 }
7201
7202 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7203 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7204 SmallPtrSetImpl<Instruction *> &DeadInstructions,
7205 const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7206
7207 // Hold a mapping from predicated instructions to their recipes, in order to
7208 // fix their AlsoPack behavior if a user is determined to replicate and use a
7209 // scalar instead of vector value.
7210 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7211
7212 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7213
7214 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7215
7216 // ---------------------------------------------------------------------------
7217 // Pre-construction: record ingredients whose recipes we'll need to further
7218 // process after constructing the initial VPlan.
7219 // ---------------------------------------------------------------------------
7220
7221 // Mark instructions we'll need to sink later and their targets as
7222 // ingredients whose recipe we'll need to record.
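// (SinkAfter maps an instruction to the instruction it must be placed after;
// it typically stems from first-order recurrences recognized by Legality.
// The ordering itself is enforced later by the moveAfter() call in the
// transform phase below; here we only make sure both recipes get recorded.)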
7223 for (auto &Entry : SinkAfter) {
7224 RecipeBuilder.recordRecipeOf(Entry.first);
7225 RecipeBuilder.recordRecipeOf(Entry.second);
7226 }
7227
7228 // For each interleave group which is relevant for this (possibly trimmed)
7229 // Range, add it to the set of groups to be later applied to the VPlan and add
7230 // placeholders for its members' Recipes which we'll be replacing with a
7231 // single VPInterleaveRecipe.
7232 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7233 auto applyIG = [IG, this](unsigned VF) -> bool {
7234 return (VF >= 2 && // Query is illegal for VF == 1
7235 CM.getWideningDecision(IG->getInsertPos(), VF) ==
7236 LoopVectorizationCostModel::CM_Interleave);
7237 };
7238 if (!getDecisionAndClampRange(applyIG, Range))
7239 continue;
7240 InterleaveGroups.insert(IG);
7241 for (unsigned i = 0; i < IG->getFactor(); i++)
7242 if (Instruction *Member = IG->getMember(i))
7243 RecipeBuilder.recordRecipeOf(Member);
7244 }
7245
7246 // ---------------------------------------------------------------------------
7247 // Build initial VPlan: Scan the body of the loop in a topological order to
7248 // visit each basic block after having visited its predecessor basic blocks.
7249 // ---------------------------------------------------------------------------
7250
7251 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7252 auto Plan = std::make_unique<VPlan>();
7253 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7254 Plan->setEntry(VPBB);
7255
7256 // Represent values that will have defs inside VPlan.
7257 for (Value *V : NeedDef)
7258 Plan->addVPValue(V);
7259
7260 // Scan the body of the loop in a topological order to visit each basic block
7261 // after having visited its predecessor basic blocks.
7262 LoopBlocksDFS DFS(OrigLoop);
7263 DFS.perform(LI);
7264
7265 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7266 // Relevant instructions from basic block BB will be grouped into VPRecipe
7267 // ingredients and fill a new VPBasicBlock.
7268 unsigned VPBBsForBB = 0;
7269 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7270 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7271 VPBB = FirstVPBBForBB;
7272 Builder.setInsertPoint(VPBB);
7273
7274 // Introduce each ingredient into VPlan.
7275 // TODO: Model and preserve debug intrinsics in VPlan.
7276 for (Instruction &I : BB->instructionsWithoutDebug()) {
7277 Instruction *Instr = &I;
7278
7279 // First filter out irrelevant instructions, to ensure no recipes are
7280 // built for them.
7281 if (isa<BranchInst>(Instr) ||
7282 DeadInstructions.find(Instr) != DeadInstructions.end())
7283 continue;
7284
7285 if (auto Recipe =
7286 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
7287 RecipeBuilder.setRecipe(Instr, Recipe);
7288 VPBB->appendRecipe(Recipe);
7289 continue;
7290 }
7291
7292 // Otherwise, if all widening options failed, the instruction is to be
7293 // replicated. This may create a successor for VPBB.
7294 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7295 Instr, Range, VPBB, PredInst2Recipe, Plan);
7296 if (NextVPBB != VPBB) {
7297 VPBB = NextVPBB;
7298 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7299 : "");
7300 }
7301 }
7302 }
7303
7304 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
7305 // may also be empty, such as the last one, VPBB, reflecting original
7306 // basic-blocks with no recipes.
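// As a rough illustration (hypothetical single-block loop body named
// "for.body" containing one predicated store), the plan at this point looks
// like:
//   Pre-Entry -> for.body -> [pred.store region] -> for.body.0
// The dummy Pre-Entry block is discarded next.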
7307 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7308 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7309 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7310 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7311 delete PreEntry; 7312 7313 // --------------------------------------------------------------------------- 7314 // Transform initial VPlan: Apply previously taken decisions, in order, to 7315 // bring the VPlan to its final state. 7316 // --------------------------------------------------------------------------- 7317 7318 // Apply Sink-After legal constraints. 7319 for (auto &Entry : SinkAfter) { 7320 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7321 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7322 Sink->moveAfter(Target); 7323 } 7324 7325 // Interleave memory: for each Interleave Group we marked earlier as relevant 7326 // for this VPlan, replace the Recipes widening its memory instructions with a 7327 // single VPInterleaveRecipe at its insertion point. 7328 for (auto IG : InterleaveGroups) { 7329 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7330 RecipeBuilder.getRecipe(IG->getInsertPos())); 7331 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) 7332 ->insertBefore(Recipe); 7333 7334 for (unsigned i = 0; i < IG->getFactor(); ++i) 7335 if (Instruction *Member = IG->getMember(i)) { 7336 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7337 } 7338 } 7339 7340 // Finally, if tail is folded by masking, introduce selects between the phi 7341 // and the live-out instruction of each reduction, at the end of the latch. 7342 if (CM.foldTailByMasking()) { 7343 Builder.setInsertPoint(VPBB); 7344 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 7345 for (auto &Reduction : Legal->getReductionVars()) { 7346 VPValue *Phi = Plan->getVPValue(Reduction.first); 7347 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); 7348 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 7349 } 7350 } 7351 7352 std::string PlanName; 7353 raw_string_ostream RSO(PlanName); 7354 unsigned VF = Range.Start; 7355 Plan->addVF(VF); 7356 RSO << "Initial VPlan for VF={" << VF; 7357 for (VF *= 2; VF < Range.End; VF *= 2) { 7358 Plan->addVF(VF); 7359 RSO << "," << VF; 7360 } 7361 RSO << "},UF>=1"; 7362 RSO.flush(); 7363 Plan->setName(PlanName); 7364 7365 return Plan; 7366 } 7367 7368 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 7369 // Outer loop handling: They may require CFG and instruction level 7370 // transformations before even evaluating whether vectorization is profitable. 7371 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7372 // the vectorization pipeline. 7373 assert(!OrigLoop->empty()); 7374 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7375 7376 // Create new empty VPlan 7377 auto Plan = std::make_unique<VPlan>(); 7378 7379 // Build hierarchical CFG 7380 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 7381 HCFGBuilder.buildHierarchicalCFG(); 7382 7383 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) 7384 Plan->addVF(VF); 7385 7386 if (EnableVPlanPredication) { 7387 VPlanPredicator VPP(*Plan); 7388 VPP.predicate(); 7389 7390 // Avoid running transformation to recipes until masked code generation in 7391 // VPlan-native path is in place. 
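// (Reminder, with a hedged example invocation: this function is only reached
// when the EnableVPlanNativePath cl::opt is set, e.g. something like
//   opt -passes=loop-vectorize -enable-vplan-native-path ...
// and, with EnableVPlanPredication also set, the plan is returned here still
// holding VPInstructions rather than recipes.)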
7392 return Plan; 7393 } 7394 7395 SmallPtrSet<Instruction *, 1> DeadInstructions; 7396 VPlanTransforms::VPInstructionsToVPRecipes( 7397 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 7398 return Plan; 7399 } 7400 7401 Value* LoopVectorizationPlanner::VPCallbackILV:: 7402 getOrCreateVectorValues(Value *V, unsigned Part) { 7403 return ILV.getOrCreateVectorValue(V, Part); 7404 } 7405 7406 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 7407 Value *V, const VPIteration &Instance) { 7408 return ILV.getOrCreateScalarValue(V, Instance); 7409 } 7410 7411 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 7412 VPSlotTracker &SlotTracker) const { 7413 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7414 IG->getInsertPos()->printAsOperand(O, false); 7415 O << ", "; 7416 getAddr()->printAsOperand(O, SlotTracker); 7417 VPValue *Mask = getMask(); 7418 if (Mask) { 7419 O << ", "; 7420 Mask->printAsOperand(O, SlotTracker); 7421 } 7422 for (unsigned i = 0; i < IG->getFactor(); ++i) 7423 if (Instruction *I = IG->getMember(i)) 7424 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 7425 } 7426 7427 void VPWidenCallRecipe::execute(VPTransformState &State) { 7428 State.ILV->widenCallInstruction(Ingredient, User, State); 7429 } 7430 7431 void VPWidenSelectRecipe::execute(VPTransformState &State) { 7432 State.ILV->widenSelectInstruction(Ingredient, InvariantCond); 7433 } 7434 7435 void VPWidenRecipe::execute(VPTransformState &State) { 7436 State.ILV->widenInstruction(Ingredient, User, State); 7437 } 7438 7439 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7440 State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, 7441 IsIndexLoopInvariant); 7442 } 7443 7444 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7445 assert(!State.Instance && "Int or FP induction being replicated."); 7446 State.ILV->widenIntOrFpInduction(IV, Trunc); 7447 } 7448 7449 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7450 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7451 } 7452 7453 void VPBlendRecipe::execute(VPTransformState &State) { 7454 State.ILV->setDebugLocFromInst(State.Builder, Phi); 7455 // We know that all PHIs in non-header blocks are converted into 7456 // selects, so we don't have to worry about the insertion order and we 7457 // can just use the builder. 7458 // At this point we generate the predication tree. There may be 7459 // duplications since this is a simple recursive scan, but future 7460 // optimizations will clean it up. 7461 7462 unsigned NumIncoming = getNumIncomingValues(); 7463 7464 // Generate a sequence of selects of the form: 7465 // SELECT(Mask3, In3, 7466 // SELECT(Mask2, In2, 7467 // SELECT(Mask1, In1, 7468 // In0))) 7469 // Note that Mask0 is never used: lanes for which no path reaches this phi and 7470 // are essentially undef are taken from In0. 7471 InnerLoopVectorizer::VectorParts Entry(State.UF); 7472 for (unsigned In = 0; In < NumIncoming; ++In) { 7473 for (unsigned Part = 0; Part < State.UF; ++Part) { 7474 // We might have single edge PHIs (blocks) - use an identity 7475 // 'select' for the first PHI operand. 7476 Value *In0 = State.get(getIncomingValue(In), Part); 7477 if (In == 0) 7478 Entry[Part] = In0; // Initialize with the first incoming value. 7479 else { 7480 // Select between the current value and the previous incoming edge 7481 // based on the incoming mask. 
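// A hedged IR-level sketch (VF=4, UF=1, an i32 phi and two incoming values):
//   %predphi = select <4 x i1> %mask.1, <4 x i32> %in.1, <4 x i32> %in.0
// With more incoming values the selects nest as outlined in the comment at
// the top of this function.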
7482 Value *Cond = State.get(getMask(In), Part);
7483 Entry[Part] =
7484 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7485 }
7486 }
7487 }
7488 for (unsigned Part = 0; Part < State.UF; ++Part)
7489 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7490 }
7491
7492 void VPInterleaveRecipe::execute(VPTransformState &State) {
7493 assert(!State.Instance && "Interleave group being replicated.");
7494 State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
7495 }
7496
7497 void VPReplicateRecipe::execute(VPTransformState &State) {
7498 if (State.Instance) { // Generate a single instance.
7499 State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance,
7500 IsPredicated, State);
7501 // Insert scalar instance packing it into a vector.
7502 if (AlsoPack && State.VF > 1) {
7503 // If we're constructing lane 0, initialize to start from undef.
7504 if (State.Instance->Lane == 0) {
7505 Value *Undef =
7506 UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7507 State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7508 }
7509 State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7510 }
7511 return;
7512 }
7513
7514 // Generate scalar instances for all VF lanes of all UF parts, unless the
7515 // instruction is uniform, in which case generate only the first lane for each
7516 // of the UF parts.
7517 unsigned EndLane = IsUniform ? 1 : State.VF;
7518 for (unsigned Part = 0; Part < State.UF; ++Part)
7519 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7520 State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane},
7521 IsPredicated, State);
7522 }
7523
7524 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7525 assert(State.Instance && "Branch on Mask works only on single instance.");
7526
7527 unsigned Part = State.Instance->Part;
7528 unsigned Lane = State.Instance->Lane;
7529
7530 Value *ConditionBit = nullptr;
7531 VPValue *BlockInMask = getMask();
7532 if (BlockInMask) {
7533 ConditionBit = State.get(BlockInMask, Part);
7534 if (ConditionBit->getType()->isVectorTy())
7535 ConditionBit = State.Builder.CreateExtractElement(
7536 ConditionBit, State.Builder.getInt32(Lane));
7537 } else // Block in mask is all-one.
7538 ConditionBit = State.Builder.getTrue();
7539
7540 // Replace the temporary unreachable terminator with a new conditional branch,
7541 // whose two destinations will be set later when they are created.
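// A hedged sketch of the scalar control flow this produces for lane L of
// part P (assuming a vector block-in mask; names are illustrative):
//   %bit = extractelement <VF x i1> %mask, i32 L
//   br i1 %bit, label %pred.<op>.if, label %pred.<op>.continue
// where both branch targets are filled in once those blocks are emitted.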
7542 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 7543 assert(isa<UnreachableInst>(CurrentTerminator) && 7544 "Expected to replace unreachable terminator with conditional branch."); 7545 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 7546 CondBr->setSuccessor(0, nullptr); 7547 ReplaceInstWithInst(CurrentTerminator, CondBr); 7548 } 7549 7550 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 7551 assert(State.Instance && "Predicated instruction PHI works per instance."); 7552 Instruction *ScalarPredInst = cast<Instruction>( 7553 State.ValueMap.getScalarValue(PredInst, *State.Instance)); 7554 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 7555 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 7556 assert(PredicatingBB && "Predicated block has no single predecessor."); 7557 7558 // By current pack/unpack logic we need to generate only a single phi node: if 7559 // a vector value for the predicated instruction exists at this point it means 7560 // the instruction has vector users only, and a phi for the vector value is 7561 // needed. In this case the recipe of the predicated instruction is marked to 7562 // also do that packing, thereby "hoisting" the insert-element sequence. 7563 // Otherwise, a phi node for the scalar value is needed. 7564 unsigned Part = State.Instance->Part; 7565 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 7566 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 7567 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 7568 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 7569 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 7570 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 7571 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 7572 } else { 7573 Type *PredInstType = PredInst->getType(); 7574 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 7575 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 7576 Phi->addIncoming(ScalarPredInst, PredicatedBB); 7577 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 7578 } 7579 } 7580 7581 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 7582 VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr; 7583 State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue, 7584 getMask()); 7585 } 7586 7587 // Determine how to lower the scalar epilogue, which depends on 1) optimising 7588 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 7589 // predication, and 4) a TTI hook that analyses whether the loop is suitable 7590 // for predication. 7591 static ScalarEpilogueLowering getScalarEpilogueLowering( 7592 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 7593 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 7594 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 7595 LoopVectorizationLegality &LVL) { 7596 bool OptSize = 7597 F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 7598 PGSOQueryType::IRPass); 7599 // 1) OptSize takes precedence over all other options, i.e. if this is set, 7600 // don't look at hints or options, and don't request a scalar epilogue. 
7601 if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
7602 return CM_ScalarEpilogueNotAllowedOptSize;
7603
7604 bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7605 !PreferPredicateOverEpilog;
7606
7607 // 2) Next, if disabling predication is requested on the command line, honour
7608 // this and request a scalar epilogue.
7609 if (PredicateOptDisabled)
7610 return CM_ScalarEpilogueAllowed;
7611
7612 // 3) and 4): if enabling predication is requested on the command line or
7613 // with a loop hint, or if the TTI hook indicates this is profitable, request
7614 // predication.
7615 if (PreferPredicateOverEpilog ||
7616 Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7617 (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7618 LVL.getLAI()) &&
7619 Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7620 return CM_ScalarEpilogueNotNeededUsePredicate;
7621
7622 return CM_ScalarEpilogueAllowed;
7623 }
7624
7625 // Process the loop in the VPlan-native vectorization path. This path builds
7626 // VPlan upfront in the vectorization pipeline, which allows applying
7627 // VPlan-to-VPlan transformations from the very beginning without modifying the
7628 // input LLVM IR.
7629 static bool processLoopInVPlanNativePath(
7630 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7631 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7632 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7633 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7634 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7635
7636 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7637 Function *F = L->getHeader()->getParent();
7638 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7639
7640 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7641 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7642
7643 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7644 &Hints, IAI);
7645 // Use the planner for outer loop vectorization.
7646 // TODO: CM is not used at this point inside the planner. Turn CM into an
7647 // optional argument if we don't need it in the future.
7648 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
7649
7650 // Get user vectorization factor.
7651 const unsigned UserVF = Hints.getWidth();
7652
7653 // Plan how to best vectorize, return the best VF and its cost.
7654 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7655
7656 // If we are stress testing VPlan builds, do not attempt to generate vector
7657 // code. Masked vector code generation support will follow soon.
7658 // Also, do not attempt to vectorize if no vector code will be produced.
7659 if (VPlanBuildStressTest || EnableVPlanPredication ||
7660 VectorizationFactor::Disabled() == VF)
7661 return false;
7662
7663 LVP.setBestPlan(VF.Width, 1);
7664
7665 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7666 &CM);
7667 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7668 << L->getHeader()->getParent()->getName() << "\"\n");
7669 LVP.executePlan(LB, DT);
7670
7671 // Mark the loop as already vectorized to avoid vectorizing again.
7672 Hints.setAlreadyVectorized(); 7673 7674 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 7675 return true; 7676 } 7677 7678 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 7679 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 7680 !EnableLoopInterleaving), 7681 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 7682 !EnableLoopVectorization) {} 7683 7684 bool LoopVectorizePass::processLoop(Loop *L) { 7685 assert((EnableVPlanNativePath || L->empty()) && 7686 "VPlan-native path is not enabled. Only process inner loops."); 7687 7688 #ifndef NDEBUG 7689 const std::string DebugLocStr = getDebugLocString(L); 7690 #endif /* NDEBUG */ 7691 7692 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 7693 << L->getHeader()->getParent()->getName() << "\" from " 7694 << DebugLocStr << "\n"); 7695 7696 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 7697 7698 LLVM_DEBUG( 7699 dbgs() << "LV: Loop hints:" 7700 << " force=" 7701 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 7702 ? "disabled" 7703 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 7704 ? "enabled" 7705 : "?")) 7706 << " width=" << Hints.getWidth() 7707 << " unroll=" << Hints.getInterleave() << "\n"); 7708 7709 // Function containing loop 7710 Function *F = L->getHeader()->getParent(); 7711 7712 // Looking at the diagnostic output is the only way to determine if a loop 7713 // was vectorized (other than looking at the IR or machine code), so it 7714 // is important to generate an optimization remark for each loop. Most of 7715 // these messages are generated as OptimizationRemarkAnalysis. Remarks 7716 // generated as OptimizationRemark and OptimizationRemarkMissed are 7717 // less verbose reporting vectorized loops and unvectorized loops that may 7718 // benefit from vectorization, respectively. 7719 7720 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 7721 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 7722 return false; 7723 } 7724 7725 PredicatedScalarEvolution PSE(*SE, *L); 7726 7727 // Check if it is legal to vectorize the loop. 7728 LoopVectorizationRequirements Requirements(*ORE); 7729 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 7730 &Requirements, &Hints, DB, AC); 7731 if (!LVL.canVectorize(EnableVPlanNativePath)) { 7732 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 7733 Hints.emitRemarkWithHints(); 7734 return false; 7735 } 7736 7737 // Check the function attributes and profiles to find out if this function 7738 // should be optimized for size. 7739 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7740 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 7741 7742 // Entrance to the VPlan-native vectorization path. Outer loops are processed 7743 // here. They may require CFG and instruction level transformations before 7744 // even evaluating whether vectorization is profitable. Since we cannot modify 7745 // the incoming IR, we need to build VPlan upfront in the vectorization 7746 // pipeline. 7747 if (!L->empty()) 7748 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 7749 ORE, BFI, PSI, Hints); 7750 7751 assert(L->empty() && "Inner loop expected."); 7752 7753 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 7754 // count by optimizing for size, to minimize overheads. 
7755 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 7756 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 7757 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 7758 << "This loop is worth vectorizing only if no scalar " 7759 << "iteration overheads are incurred."); 7760 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 7761 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 7762 else { 7763 LLVM_DEBUG(dbgs() << "\n"); 7764 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 7765 } 7766 } 7767 7768 // Check the function attributes to see if implicit floats are allowed. 7769 // FIXME: This check doesn't seem possibly correct -- what if the loop is 7770 // an integer loop and the vector instructions selected are purely integer 7771 // vector instructions? 7772 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 7773 reportVectorizationFailure( 7774 "Can't vectorize when the NoImplicitFloat attribute is used", 7775 "loop not vectorized due to NoImplicitFloat attribute", 7776 "NoImplicitFloat", ORE, L); 7777 Hints.emitRemarkWithHints(); 7778 return false; 7779 } 7780 7781 // Check if the target supports potentially unsafe FP vectorization. 7782 // FIXME: Add a check for the type of safety issue (denormal, signaling) 7783 // for the target we're vectorizing for, to make sure none of the 7784 // additional fp-math flags can help. 7785 if (Hints.isPotentiallyUnsafe() && 7786 TTI->isFPVectorizationPotentiallyUnsafe()) { 7787 reportVectorizationFailure( 7788 "Potentially unsafe FP op prevents vectorization", 7789 "loop not vectorized due to unsafe FP support.", 7790 "UnsafeFP", ORE, L); 7791 Hints.emitRemarkWithHints(); 7792 return false; 7793 } 7794 7795 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 7796 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 7797 7798 // If an override option has been passed in for interleaved accesses, use it. 7799 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 7800 UseInterleaved = EnableInterleavedMemAccesses; 7801 7802 // Analyze interleaved memory accesses. 7803 if (UseInterleaved) { 7804 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 7805 } 7806 7807 // Use the cost model. 7808 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 7809 F, &Hints, IAI); 7810 CM.collectValuesToIgnore(); 7811 7812 // Use the planner for vectorization. 7813 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); 7814 7815 // Get user vectorization factor and interleave count. 7816 unsigned UserVF = Hints.getWidth(); 7817 unsigned UserIC = Hints.getInterleave(); 7818 7819 // Plan how to best vectorize, return the best VF and its cost. 7820 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 7821 7822 VectorizationFactor VF = VectorizationFactor::Disabled(); 7823 unsigned IC = 1; 7824 7825 if (MaybeVF) { 7826 VF = *MaybeVF; 7827 // Select the interleave count. 7828 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 7829 } 7830 7831 // Identify the diagnostic messages that should be produced. 
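// (For reference, a hedged example of how these remarks can surface to users
// via -Rpass=loop-vectorize / -Rpass-analysis=loop-vectorize; the source
// location is hypothetical:
//   remark: foo.c:12:3: vectorized loop (vectorization width: 4,
//           interleaved count: 2)
//   remark: foo.c:12:3: the cost-model indicates that vectorization is not
//           beneficial )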
7832 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7833 bool VectorizeLoop = true, InterleaveLoop = true;
7834 if (Requirements.doesNotMeet(F, L, Hints)) {
7835 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7836 "requirements.\n");
7837 Hints.emitRemarkWithHints();
7838 return false;
7839 }
7840
7841 if (VF.Width == 1) {
7842 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7843 VecDiagMsg = std::make_pair(
7844 "VectorizationNotBeneficial",
7845 "the cost-model indicates that vectorization is not beneficial");
7846 VectorizeLoop = false;
7847 }
7848
7849 if (!MaybeVF && UserIC > 1) {
7850 // Tell the user interleaving was avoided up-front, despite being explicitly
7851 // requested.
7852 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7853 "interleaving should be avoided up front\n");
7854 IntDiagMsg = std::make_pair(
7855 "InterleavingAvoided",
7856 "Ignoring UserIC, because interleaving was avoided up front");
7857 InterleaveLoop = false;
7858 } else if (IC == 1 && UserIC <= 1) {
7859 // Tell the user interleaving is not beneficial.
7860 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7861 IntDiagMsg = std::make_pair(
7862 "InterleavingNotBeneficial",
7863 "the cost-model indicates that interleaving is not beneficial");
7864 InterleaveLoop = false;
7865 if (UserIC == 1) {
7866 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7867 IntDiagMsg.second +=
7868 " and is explicitly disabled or interleave count is set to 1";
7869 }
7870 } else if (IC > 1 && UserIC == 1) {
7871 // Tell the user interleaving is beneficial, but it is explicitly disabled.
7872 LLVM_DEBUG(
7873 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7874 IntDiagMsg = std::make_pair(
7875 "InterleavingBeneficialButDisabled",
7876 "the cost-model indicates that interleaving is beneficial "
7877 "but is explicitly disabled or interleave count is set to 1");
7878 InterleaveLoop = false;
7879 }
7880
7881 // Override IC if user provided an interleave count.
7882 IC = UserIC > 0 ? UserIC : IC;
7883
7884 // Emit diagnostic messages, if any.
7885 const char *VAPassName = Hints.vectorizeAnalysisPassName();
7886 if (!VectorizeLoop && !InterleaveLoop) {
7887 // Do not vectorize or interleave the loop.
7888 ORE->emit([&]() { 7889 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 7890 L->getStartLoc(), L->getHeader()) 7891 << VecDiagMsg.second; 7892 }); 7893 ORE->emit([&]() { 7894 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 7895 L->getStartLoc(), L->getHeader()) 7896 << IntDiagMsg.second; 7897 }); 7898 return false; 7899 } else if (!VectorizeLoop && InterleaveLoop) { 7900 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7901 ORE->emit([&]() { 7902 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 7903 L->getStartLoc(), L->getHeader()) 7904 << VecDiagMsg.second; 7905 }); 7906 } else if (VectorizeLoop && !InterleaveLoop) { 7907 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7908 << ") in " << DebugLocStr << '\n'); 7909 ORE->emit([&]() { 7910 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 7911 L->getStartLoc(), L->getHeader()) 7912 << IntDiagMsg.second; 7913 }); 7914 } else if (VectorizeLoop && InterleaveLoop) { 7915 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7916 << ") in " << DebugLocStr << '\n'); 7917 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7918 } 7919 7920 LVP.setBestPlan(VF.Width, IC); 7921 7922 using namespace ore; 7923 bool DisableRuntimeUnroll = false; 7924 MDNode *OrigLoopID = L->getLoopID(); 7925 7926 if (!VectorizeLoop) { 7927 assert(IC > 1 && "interleave count should not be 1 or 0"); 7928 // If we decided that it is not legal to vectorize the loop, then 7929 // interleave it. 7930 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 7931 &CM); 7932 LVP.executePlan(Unroller, DT); 7933 7934 ORE->emit([&]() { 7935 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 7936 L->getHeader()) 7937 << "interleaved loop (interleaved count: " 7938 << NV("InterleaveCount", IC) << ")"; 7939 }); 7940 } else { 7941 // If we decided that it is *legal* to vectorize the loop, then do it. 7942 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 7943 &LVL, &CM); 7944 LVP.executePlan(LB, DT); 7945 ++LoopsVectorized; 7946 7947 // Add metadata to disable runtime unrolling a scalar loop when there are 7948 // no runtime checks about strides and memory. A scalar loop that is 7949 // rarely used is not worth unrolling. 7950 if (!LB.areSafetyChecksAdded()) 7951 DisableRuntimeUnroll = true; 7952 7953 // Report the vectorization decision. 7954 ORE->emit([&]() { 7955 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 7956 L->getHeader()) 7957 << "vectorized loop (vectorization width: " 7958 << NV("VectorizationFactor", VF.Width) 7959 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 7960 }); 7961 } 7962 7963 Optional<MDNode *> RemainderLoopID = 7964 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7965 LLVMLoopVectorizeFollowupEpilogue}); 7966 if (RemainderLoopID.hasValue()) { 7967 L->setLoopID(RemainderLoopID.getValue()); 7968 } else { 7969 if (DisableRuntimeUnroll) 7970 AddRuntimeUnrollDisableMetaData(L); 7971 7972 // Mark the loop as already vectorized to avoid vectorizing again. 
7973 Hints.setAlreadyVectorized();
7974 }
7975
7976 assert(!verifyFunction(*L->getHeader()->getParent()));
7977 return true;
7978 }
7979
7980 LoopVectorizeResult LoopVectorizePass::runImpl(
7981 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7982 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7983 DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7984 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7985 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7986 SE = &SE_;
7987 LI = &LI_;
7988 TTI = &TTI_;
7989 DT = &DT_;
7990 BFI = &BFI_;
7991 TLI = TLI_;
7992 AA = &AA_;
7993 AC = &AC_;
7994 GetLAA = &GetLAA_;
7995 DB = &DB_;
7996 ORE = &ORE_;
7997 PSI = PSI_;
7998
7999 // Don't attempt if
8000 // 1. the target claims to have no vector registers, and
8001 // 2. interleaving won't help ILP.
8002 //
8003 // The second condition is necessary because, even if the target has no
8004 // vector registers, loop vectorization may still enable scalar
8005 // interleaving.
8006 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
8007 TTI->getMaxInterleaveFactor(1) < 2)
8008 return LoopVectorizeResult(false, false);
8009
8010 bool Changed = false, CFGChanged = false;
8011
8012 // The vectorizer requires loops to be in simplified form.
8013 // Since simplification may add new inner loops, it has to run before the
8014 // legality and profitability checks. This means running the loop vectorizer
8015 // will simplify all loops, regardless of whether anything ends up being
8016 // vectorized.
8017 for (auto &L : *LI)
8018 Changed |= CFGChanged |=
8019 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
8020
8021 // Build up a worklist of inner-loops to vectorize. This is necessary as
8022 // the act of vectorizing or partially unrolling a loop creates new loops
8023 // and can invalidate iterators across the loops.
8024 SmallVector<Loop *, 8> Worklist;
8025
8026 for (Loop *L : *LI)
8027 collectSupportedLoops(*L, LI, ORE, Worklist);
8028
8029 LoopsAnalyzed += Worklist.size();
8030
8031 // Now walk the identified inner loops.
8032 while (!Worklist.empty()) {
8033 Loop *L = Worklist.pop_back_val();
8034
8035 // For the inner loops we actually process, form LCSSA to simplify the
8036 // transform.
8037 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
8038
8039 Changed |= CFGChanged |= processLoop(L);
8040 }
8041
8042 // Process each loop nest in the function.
8043 return LoopVectorizeResult(Changed, CFGChanged);
8044 }
8045
8046 PreservedAnalyses LoopVectorizePass::run(Function &F,
8047 FunctionAnalysisManager &AM) {
8048 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
8049 auto &LI = AM.getResult<LoopAnalysis>(F);
8050 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
8051 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
8052 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
8053 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
8054 auto &AA = AM.getResult<AAManager>(F);
8055 auto &AC = AM.getResult<AssumptionAnalysis>(F);
8056 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
8057 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
8058 MemorySSA *MSSA = EnableMSSALoopDependency
8059 ?
&AM.getResult<MemorySSAAnalysis>(F).getMSSA() 8060 : nullptr; 8061 8062 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 8063 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 8064 [&](Loop &L) -> const LoopAccessInfo & { 8065 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; 8066 return LAM.getResult<LoopAccessAnalysis>(L, AR); 8067 }; 8068 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 8069 ProfileSummaryInfo *PSI = 8070 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 8071 LoopVectorizeResult Result = 8072 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 8073 if (!Result.MadeAnyChange) 8074 return PreservedAnalyses::all(); 8075 PreservedAnalyses PA; 8076 8077 // We currently do not preserve loopinfo/dominator analyses with outer loop 8078 // vectorization. Until this is addressed, mark these analyses as preserved 8079 // only for non-VPlan-native path. 8080 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 8081 if (!EnableVPlanNativePath) { 8082 PA.preserve<LoopAnalysis>(); 8083 PA.preserve<DominatorTreeAnalysis>(); 8084 } 8085 PA.preserve<BasicAA>(); 8086 PA.preserve<GlobalsAA>(); 8087 if (!Result.MadeCFGChange) 8088 PA.preserveSet<CFGAnalyses>(); 8089 return PA; 8090 } 8091