//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
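//
// As a rough illustration of the 'wide iteration' transformation described
// above, consider a loop "for (i = 0; i < n; ++i) a[i] = b[i] + c[i];". The
// following is a sketch in C-like form, not the LLVM-IR this pass emits; the
// vector width of 4 and the loop itself are made up for the example:
//
//   int i = 0;
//   for (; i + 4 <= n; i += 4) {
//     // One wide iteration: four elements per trip, written here as four
//     // scalar operations that the backend lowers to a single SIMD add.
//     a[i + 0] = b[i + 0] + c[i + 0];
//     a[i + 1] = b[i + 1] + c[i + 1];
//     a[i + 2] = b[i + 2] + c[i + 2];
//     a[i + 3] = b[i + 3] + c[i + 3];
//   }
//   for (; i < n; ++i) // scalar epilogue handles the remaining iterations
//     a[i] = b[i] + c[i];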
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired and that predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in "
             "a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Interleave loops at runtime for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of the loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = FixedVectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns the "best known" trip count for the specified loop \p L as defined
/// by the following procedure:
///   1) Returns the exact trip count if it is known.
///   2) Returns the expected trip count according to profile data, if any.
///   3) Returns an upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
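///
/// A sketch of how this class is typically driven (the actual sequence lives
/// in LoopVectorizePass::processLoop; the planner object LVP and most variable
/// names below are illustrative):
///
/// \code
///   // Widen with the chosen vectorization factor VF and interleave count IC.
///   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
///                          &LVL, &CM, BFI, PSI);
///   // The planner executes the selected VPlan, calling back into LB to
///   // create the loop skeleton and widen the instructions.
///   LVP.executePlan(LB, DT);
/// \endcode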
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
                unsigned VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p Operands instead of
  /// \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate
  /// a vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                VPTransformState &State, VPValue *Addr,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr, VPValue *StoredValue,
                                  VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...;
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variables.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one.
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - the original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In
  /// the latter case \p EntryVal is a TruncInst and we must not record anything
  /// for that IV, but it's error-prone to expect callers of this routine to
  /// care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  void createInductionResumeValues(Loop *L, Value *VectorTripCount);

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM, BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B,
                                              const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints for how the scalar epilogue loop should
// be lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdown due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero,
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Set up cost-based decisions for the user-provided vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If the interleave count has been specified by metadata it will be
  /// returned. Otherwise, the interleave count is computed and returned. VF
  /// and LoopCost are the selected vectorization factor and the cost of the
  /// selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way. The
  /// form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group, but assign
    // the cost to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return true if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on the CM decision for Load/Store instructions
  /// that may be vectorized as an interleaved access, a gather/scatter, or
  /// scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports a masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports a masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports a masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports a masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed; it may be disallowed due to
  /// optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the loop tail.
1345 bool foldTailByMasking() const { return FoldTailByMasking; } 1346 1347 bool blockNeedsPredication(BasicBlock *BB) { 1348 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1349 } 1350 1351 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1352 /// nodes to the chain of instructions representing the reductions. Uses a 1353 /// MapVector to ensure deterministic iteration order. 1354 using ReductionChainMap = 1355 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1356 1357 /// Return the chain of instructions representing an inloop reduction. 1358 const ReductionChainMap &getInLoopReductionChains() const { 1359 return InLoopReductionChains; 1360 } 1361 1362 /// Returns true if the Phi is part of an inloop reduction. 1363 bool isInLoopReduction(PHINode *Phi) const { 1364 return InLoopReductionChains.count(Phi); 1365 } 1366 1367 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1368 /// with factor VF. Return the cost of the instruction, including 1369 /// scalarization overhead if it's needed. 1370 unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF); 1371 1372 /// Estimate cost of a call instruction CI if it were vectorized with factor 1373 /// VF. Return the cost of the instruction, including scalarization overhead 1374 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1375 /// scalarized - 1376 /// i.e. either vector version isn't available, or is too expensive. 1377 unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize); 1378 1379 /// Invalidates decisions already taken by the cost model. 1380 void invalidateCostModelingDecisions() { 1381 WideningDecisions.clear(); 1382 Uniforms.clear(); 1383 Scalars.clear(); 1384 } 1385 1386 private: 1387 unsigned NumPredStores = 0; 1388 1389 /// \return An upper bound for the vectorization factor, a power-of-2 larger 1390 /// than zero. One is returned if vectorization should best be avoided due 1391 /// to cost. 1392 unsigned computeFeasibleMaxVF(unsigned ConstTripCount); 1393 1394 /// The vectorization cost is a combination of the cost itself and a boolean 1395 /// indicating whether any of the contributing operations will actually 1396 /// operate on 1397 /// vector values after type legalization in the backend. If this latter value 1398 /// is 1399 /// false, then all operations will be scalarized (i.e. no vectorization has 1400 /// actually taken place). 1401 using VectorizationCostTy = std::pair<unsigned, bool>; 1402 1403 /// Returns the expected execution cost. The unit of the cost does 1404 /// not matter because we use the 'cost' units to compare different 1405 /// vector widths. The cost that is returned is *not* normalized by 1406 /// the factor width. 1407 VectorizationCostTy expectedCost(unsigned VF); 1408 1409 /// Returns the execution time cost of an instruction for a given vector 1410 /// width. Vector width of one means scalar. 1411 VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF); 1412 1413 /// The cost-computation logic from getInstructionCost which provides 1414 /// the vector type as an output parameter. 1415 unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy); 1416 1417 /// Calculate vectorization cost of memory instruction \p I. 1418 unsigned getMemoryInstructionCost(Instruction *I, unsigned VF); 1419 1420 /// The cost computation for scalarized memory instruction. 
1421 unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1422 
1423 /// The cost computation for an interleaving group of memory instructions.
1424 unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1425 
1426 /// The cost computation for a Gather/Scatter instruction.
1427 unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1428 
1429 /// The cost computation for widening instruction \p I with consecutive
1430 /// memory access.
1431 unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1432 
1433 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1434 /// Load: scalar load + broadcast.
1435 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1436 /// element)
1437 unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1438 
1439 /// Estimate the overhead of scalarizing an instruction. This is a
1440 /// convenience wrapper for the type-based getScalarizationOverhead API.
1441 unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1442 
1443 /// Returns whether the instruction is a load or store and will be emitted
1444 /// as a vector operation.
1445 bool isConsecutiveLoadOrStore(Instruction *I);
1446 
1447 /// Returns true if an artificially high cost for emulated masked memrefs
1448 /// should be used.
1449 bool useEmulatedMaskMemRefHack(Instruction *I);
1450 
1451 /// Map of scalar integer values to the smallest bitwidth they can be legally
1452 /// represented as. The vector equivalents of these values should be truncated
1453 /// to this type.
1454 MapVector<Instruction *, uint64_t> MinBWs;
1455 
1456 /// A type representing the costs for instructions if they were to be
1457 /// scalarized rather than vectorized. The entries are Instruction-Cost
1458 /// pairs.
1459 using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1460 
1461 /// A set containing all BasicBlocks that are known to be present after
1462 /// vectorization as predicated blocks.
1463 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1464 
1465 /// Records whether it is allowed to have the original scalar loop execute at
1466 /// least once. This may be needed as a fallback loop in case runtime
1467 /// aliasing/dependence checks fail, or to handle the tail/remainder
1468 /// iterations when the trip count is unknown or doesn't divide by the VF,
1469 /// or as a peel-loop to handle gaps in interleave-groups.
1470 /// Under optsize and when the trip count is very small we don't allow any
1471 /// iterations to execute in the scalar loop.
1472 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1473 
1474 /// All blocks of the loop are to be masked to fold the tail of the scalar iterations.
1475 bool FoldTailByMasking = false;
1476 
1477 /// A map holding scalar costs for different vectorization factors. The
1478 /// presence of a cost for an instruction in the mapping indicates that the
1479 /// instruction will be scalarized when vectorizing with the associated
1480 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1481 DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1482 
1483 /// Holds the instructions known to be uniform after vectorization.
1484 /// The data is collected per VF.
1485 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1486 
1487 /// Holds the instructions known to be scalar after vectorization.
1488 /// The data is collected per VF.
1489 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars; 1490 1491 /// Holds the instructions (address computations) that are forced to be 1492 /// scalarized. 1493 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1494 1495 /// PHINodes of the reductions that should be expanded in-loop along with 1496 /// their associated chains of reduction operations, in program order from top 1497 /// (PHI) to bottom 1498 ReductionChainMap InLoopReductionChains; 1499 1500 /// Returns the expected difference in cost from scalarizing the expression 1501 /// feeding a predicated instruction \p PredInst. The instructions to 1502 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1503 /// non-negative return value implies the expression will be scalarized. 1504 /// Currently, only single-use chains are considered for scalarization. 1505 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1506 unsigned VF); 1507 1508 /// Collect the instructions that are uniform after vectorization. An 1509 /// instruction is uniform if we represent it with a single scalar value in 1510 /// the vectorized loop corresponding to each vector iteration. Examples of 1511 /// uniform instructions include pointer operands of consecutive or 1512 /// interleaved memory accesses. Note that although uniformity implies an 1513 /// instruction will be scalar, the reverse is not true. In general, a 1514 /// scalarized instruction will be represented by VF scalar values in the 1515 /// vectorized loop, each corresponding to an iteration of the original 1516 /// scalar loop. 1517 void collectLoopUniforms(unsigned VF); 1518 1519 /// Collect the instructions that are scalar after vectorization. An 1520 /// instruction is scalar if it is known to be uniform or will be scalarized 1521 /// during vectorization. Non-uniform scalarized instructions will be 1522 /// represented by VF values in the vectorized loop, each corresponding to an 1523 /// iteration of the original scalar loop. 1524 void collectLoopScalars(unsigned VF); 1525 1526 /// Keeps cost model vectorization decision and cost for instructions. 1527 /// Right now it is used for memory instructions only. 1528 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, 1529 std::pair<InstWidening, unsigned>>; 1530 1531 DecisionList WideningDecisions; 1532 1533 /// Returns true if \p V is expected to be vectorized and it needs to be 1534 /// extracted. 1535 bool needsExtract(Value *V, unsigned VF) const { 1536 Instruction *I = dyn_cast<Instruction>(V); 1537 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) 1538 return false; 1539 1540 // Assume we can vectorize V (and hence we need extraction) if the 1541 // scalars are not computed yet. This can happen, because it is called 1542 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1543 // the scalars are collected. That should be a safe assumption in most 1544 // cases, because we check if the operands have vectorizable types 1545 // beforehand in LoopVectorizationLegality. 1546 return Scalars.find(VF) == Scalars.end() || 1547 !isScalarAfterVectorization(I, VF); 1548 }; 1549 1550 /// Returns a range containing only operands needing to be extracted. 
1551 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1552 unsigned VF) { 1553 return SmallVector<Value *, 4>(make_filter_range( 1554 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1555 } 1556 1557 public: 1558 /// The loop that we evaluate. 1559 Loop *TheLoop; 1560 1561 /// Predicated scalar evolution analysis. 1562 PredicatedScalarEvolution &PSE; 1563 1564 /// Loop Info analysis. 1565 LoopInfo *LI; 1566 1567 /// Vectorization legality. 1568 LoopVectorizationLegality *Legal; 1569 1570 /// Vector target information. 1571 const TargetTransformInfo &TTI; 1572 1573 /// Target Library Info. 1574 const TargetLibraryInfo *TLI; 1575 1576 /// Demanded bits analysis. 1577 DemandedBits *DB; 1578 1579 /// Assumption cache. 1580 AssumptionCache *AC; 1581 1582 /// Interface to emit optimization remarks. 1583 OptimizationRemarkEmitter *ORE; 1584 1585 const Function *TheFunction; 1586 1587 /// Loop Vectorize Hint. 1588 const LoopVectorizeHints *Hints; 1589 1590 /// The interleave access information contains groups of interleaved accesses 1591 /// with the same stride and close to each other. 1592 InterleavedAccessInfo &InterleaveInfo; 1593 1594 /// Values to ignore in the cost model. 1595 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1596 1597 /// Values to ignore in the cost model when VF > 1. 1598 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1599 }; 1600 1601 } // end namespace llvm 1602 1603 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1604 // vectorization. The loop needs to be annotated with #pragma omp simd 1605 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1606 // vector length information is not provided, vectorization is not considered 1607 // explicit. Interleave hints are not allowed either. These limitations will be 1608 // relaxed in the future. 1609 // Please, note that we are currently forced to abuse the pragma 'clang 1610 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1611 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1612 // provides *explicit vectorization hints* (LV can bypass legal checks and 1613 // assume that vectorization is legal). However, both hints are implemented 1614 // using the same metadata (llvm.loop.vectorize, processed by 1615 // LoopVectorizeHints). This will be fixed in the future when the native IR 1616 // representation for pragma 'omp simd' is introduced. 1617 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1618 OptimizationRemarkEmitter *ORE) { 1619 assert(!OuterLp->empty() && "This is not an outer loop"); 1620 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1621 1622 // Only outer loops with an explicit vectorization hint are supported. 1623 // Unannotated outer loops are ignored. 1624 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1625 return false; 1626 1627 Function *Fn = OuterLp->getHeader()->getParent(); 1628 if (!Hints.allowVectorization(Fn, OuterLp, 1629 true /*VectorizeOnlyWhenForced*/)) { 1630 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1631 return false; 1632 } 1633 1634 if (Hints.getInterleave() > 1) { 1635 // TODO: Interleave support is future work. 
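// Illustrative example (not in the original source): an outer loop annotated
// with, e.g.,
//   #pragma clang loop vectorize(enable) vectorize_width(4) interleave_count(2)
// reaches this point and is rejected, because interleaving is not yet
// supported on the explicit outer-loop (VPlan-native) path.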
1636 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1637 "outer loops.\n"); 1638 Hints.emitRemarkWithHints(); 1639 return false; 1640 } 1641 1642 return true; 1643 } 1644 1645 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1646 OptimizationRemarkEmitter *ORE, 1647 SmallVectorImpl<Loop *> &V) { 1648 // Collect inner loops and outer loops without irreducible control flow. For 1649 // now, only collect outer loops that have explicit vectorization hints. If we 1650 // are stress testing the VPlan H-CFG construction, we collect the outermost 1651 // loop of every loop nest. 1652 if (L.empty() || VPlanBuildStressTest || 1653 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1654 LoopBlocksRPO RPOT(&L); 1655 RPOT.perform(LI); 1656 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1657 V.push_back(&L); 1658 // TODO: Collect inner loops inside marked outer loops in case 1659 // vectorization fails for the outer loop. Do not invoke 1660 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1661 // already known to be reducible. We can use an inherited attribute for 1662 // that. 1663 return; 1664 } 1665 } 1666 for (Loop *InnerL : L) 1667 collectSupportedLoops(*InnerL, LI, ORE, V); 1668 } 1669 1670 namespace { 1671 1672 /// The LoopVectorize Pass. 1673 struct LoopVectorize : public FunctionPass { 1674 /// Pass identification, replacement for typeid 1675 static char ID; 1676 1677 LoopVectorizePass Impl; 1678 1679 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1680 bool VectorizeOnlyWhenForced = false) 1681 : FunctionPass(ID), 1682 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1683 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1684 } 1685 1686 bool runOnFunction(Function &F) override { 1687 if (skipFunction(F)) 1688 return false; 1689 1690 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1691 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1692 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1693 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1694 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1695 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1696 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 1697 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1698 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1699 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1700 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1701 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1702 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1703 1704 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1705 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1706 1707 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1708 GetLAA, *ORE, PSI).MadeAnyChange; 1709 } 1710 1711 void getAnalysisUsage(AnalysisUsage &AU) const override { 1712 AU.addRequired<AssumptionCacheTracker>(); 1713 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1714 AU.addRequired<DominatorTreeWrapperPass>(); 1715 AU.addRequired<LoopInfoWrapperPass>(); 1716 AU.addRequired<ScalarEvolutionWrapperPass>(); 1717 AU.addRequired<TargetTransformInfoWrapperPass>(); 1718 AU.addRequired<AAResultsWrapperPass>(); 1719 AU.addRequired<LoopAccessLegacyAnalysis>(); 1720 AU.addRequired<DemandedBitsWrapperPass>(); 1721 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1722 AU.addRequired<InjectTLIMappingsLegacy>(); 1723 1724 // We currently do not preserve loopinfo/dominator analyses with outer loop 1725 // vectorization. Until this is addressed, mark these analyses as preserved 1726 // only for non-VPlan-native path. 1727 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1728 if (!EnableVPlanNativePath) { 1729 AU.addPreserved<LoopInfoWrapperPass>(); 1730 AU.addPreserved<DominatorTreeWrapperPass>(); 1731 } 1732 1733 AU.addPreserved<BasicAAWrapperPass>(); 1734 AU.addPreserved<GlobalsAAWrapperPass>(); 1735 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1736 } 1737 }; 1738 1739 } // end anonymous namespace 1740 1741 //===----------------------------------------------------------------------===// 1742 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1743 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1744 //===----------------------------------------------------------------------===// 1745 1746 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1747 // We need to place the broadcast of invariant variables outside the loop, 1748 // but only if it's proven safe to do so. Else, broadcast will be inside 1749 // vector loop body. 1750 Instruction *Instr = dyn_cast<Instruction>(V); 1751 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1752 (!Instr || 1753 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1754 // Place the code for broadcasting invariant variables in the new preheader. 1755 IRBuilder<>::InsertPointGuard Guard(Builder); 1756 if (SafeToHoist) 1757 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1758 1759 // Broadcast the scalar into all locations in the vector. 
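// Illustrative sketch (not part of the original comments): for VF = 4 and an
// i32 value %v, CreateVectorSplat below typically expands to IR along the
// lines of
//   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %v, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> undef, <4 x i32> zeroinitializer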
1760 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1761 1762 return Shuf; 1763 } 1764 1765 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1766 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1767 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1768 "Expected either an induction phi-node or a truncate of it!"); 1769 Value *Start = II.getStartValue(); 1770 1771 // Construct the initial value of the vector IV in the vector loop preheader 1772 auto CurrIP = Builder.saveIP(); 1773 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1774 if (isa<TruncInst>(EntryVal)) { 1775 assert(Start->getType()->isIntegerTy() && 1776 "Truncation requires an integer type"); 1777 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1778 Step = Builder.CreateTrunc(Step, TruncType); 1779 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1780 } 1781 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1782 Value *SteppedStart = 1783 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1784 1785 // We create vector phi nodes for both integer and floating-point induction 1786 // variables. Here, we determine the kind of arithmetic we will perform. 1787 Instruction::BinaryOps AddOp; 1788 Instruction::BinaryOps MulOp; 1789 if (Step->getType()->isIntegerTy()) { 1790 AddOp = Instruction::Add; 1791 MulOp = Instruction::Mul; 1792 } else { 1793 AddOp = II.getInductionOpcode(); 1794 MulOp = Instruction::FMul; 1795 } 1796 1797 // Multiply the vectorization factor by the step using integer or 1798 // floating-point arithmetic as appropriate. 1799 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); 1800 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1801 1802 // Create a vector splat to use in the induction update. 1803 // 1804 // FIXME: If the step is non-constant, we create the vector splat with 1805 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1806 // handle a constant vector splat. 1807 Value *SplatVF = isa<Constant>(Mul) 1808 ? ConstantVector::getSplat(ElementCount::getFixed(VF), 1809 cast<Constant>(Mul)) 1810 : Builder.CreateVectorSplat(VF, Mul); 1811 Builder.restoreIP(CurrIP); 1812 1813 // We may need to add the step a number of times, depending on the unroll 1814 // factor. The last of those goes into the PHI. 1815 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1816 &*LoopVectorBody->getFirstInsertionPt()); 1817 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1818 Instruction *LastInduction = VecInd; 1819 for (unsigned Part = 0; Part < UF; ++Part) { 1820 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1821 1822 if (isa<TruncInst>(EntryVal)) 1823 addMetadata(LastInduction, EntryVal); 1824 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1825 1826 LastInduction = cast<Instruction>(addFastMathFlag( 1827 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1828 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1829 } 1830 1831 // Move the last step to the end of the latch block. This ensures consistent 1832 // placement of all induction updates. 
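// Illustrative sketch (not part of the original comments): for an integer
// induction starting at 0 with step 1, VF = 4 and UF = 2, the code above
// produces roughly
//   %vec.ind      = phi <4 x i32> [ <0,1,2,3>, preheader ], [ %vec.ind.next, latch ]
//   %step.add     = add <4 x i32> %vec.ind, <4,4,4,4>   ; used by unroll part 1
//   %vec.ind.next = add <4 x i32> %step.add, <4,4,4,4>  ; becomes the backedge value
// where unroll part 0 uses %vec.ind directly, and %vec.ind.next is the "last
// step" that is moved into the latch below.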
1833 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1834 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1835 auto *ICmp = cast<Instruction>(Br->getCondition()); 1836 LastInduction->moveBefore(ICmp); 1837 LastInduction->setName("vec.ind.next"); 1838 1839 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1840 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1841 } 1842 1843 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1844 return Cost->isScalarAfterVectorization(I, VF) || 1845 Cost->isProfitableToScalarize(I, VF); 1846 } 1847 1848 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1849 if (shouldScalarizeInstruction(IV)) 1850 return true; 1851 auto isScalarInst = [&](User *U) -> bool { 1852 auto *I = cast<Instruction>(U); 1853 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1854 }; 1855 return llvm::any_of(IV->users(), isScalarInst); 1856 } 1857 1858 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1859 const InductionDescriptor &ID, const Instruction *EntryVal, 1860 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1861 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1862 "Expected either an induction phi-node or a truncate of it!"); 1863 1864 // This induction variable is not the phi from the original loop but the 1865 // newly-created IV based on the proof that casted Phi is equal to the 1866 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1867 // re-uses the same InductionDescriptor that original IV uses but we don't 1868 // have to do any recording in this case - that is done when original IV is 1869 // processed. 1870 if (isa<TruncInst>(EntryVal)) 1871 return; 1872 1873 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1874 if (Casts.empty()) 1875 return; 1876 // Only the first Cast instruction in the Casts vector is of interest. 1877 // The rest of the Casts (if exist) have no uses outside the 1878 // induction update chain itself. 1879 Instruction *CastInst = *Casts.begin(); 1880 if (Lane < UINT_MAX) 1881 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1882 else 1883 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1884 } 1885 1886 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1887 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1888 "Primary induction variable must have an integer type"); 1889 1890 auto II = Legal->getInductionVars().find(IV); 1891 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 1892 1893 auto ID = II->second; 1894 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1895 1896 // The value from the original loop to which we are mapping the new induction 1897 // variable. 1898 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1899 1900 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1901 1902 // Generate code for the induction step. 
Note that induction steps are 1903 // required to be loop-invariant 1904 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 1905 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 1906 "Induction step should be loop invariant"); 1907 if (PSE.getSE()->isSCEVable(IV->getType())) { 1908 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1909 return Exp.expandCodeFor(Step, Step->getType(), 1910 LoopVectorPreHeader->getTerminator()); 1911 } 1912 return cast<SCEVUnknown>(Step)->getValue(); 1913 }; 1914 1915 // The scalar value to broadcast. This is derived from the canonical 1916 // induction variable. If a truncation type is given, truncate the canonical 1917 // induction variable and step. Otherwise, derive these values from the 1918 // induction descriptor. 1919 auto CreateScalarIV = [&](Value *&Step) -> Value * { 1920 Value *ScalarIV = Induction; 1921 if (IV != OldInduction) { 1922 ScalarIV = IV->getType()->isIntegerTy() 1923 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1924 : Builder.CreateCast(Instruction::SIToFP, Induction, 1925 IV->getType()); 1926 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1927 ScalarIV->setName("offset.idx"); 1928 } 1929 if (Trunc) { 1930 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1931 assert(Step->getType()->isIntegerTy() && 1932 "Truncation requires an integer step"); 1933 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1934 Step = Builder.CreateTrunc(Step, TruncType); 1935 } 1936 return ScalarIV; 1937 }; 1938 1939 // Create the vector values from the scalar IV, in the absence of creating a 1940 // vector IV. 1941 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 1942 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1943 for (unsigned Part = 0; Part < UF; ++Part) { 1944 Value *EntryPart = 1945 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); 1946 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1947 if (Trunc) 1948 addMetadata(EntryPart, Trunc); 1949 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1950 } 1951 }; 1952 1953 // Now do the actual transformations, and start with creating the step value. 1954 Value *Step = CreateStepValue(ID.getStep()); 1955 if (VF <= 1) { 1956 Value *ScalarIV = CreateScalarIV(Step); 1957 CreateSplatIV(ScalarIV, Step); 1958 return; 1959 } 1960 1961 // Determine if we want a scalar version of the induction variable. This is 1962 // true if the induction variable itself is not widened, or if it has at 1963 // least one user in the loop that is not widened. 1964 auto NeedsScalarIV = needsScalarInduction(EntryVal); 1965 if (!NeedsScalarIV) { 1966 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1967 return; 1968 } 1969 1970 // Try to create a new independent vector induction variable. If we can't 1971 // create the phi node, we will splat the scalar induction variable in each 1972 // loop iteration. 1973 if (!shouldScalarizeInstruction(EntryVal)) { 1974 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1975 Value *ScalarIV = CreateScalarIV(Step); 1976 // Create scalar steps that can be used by instructions we will later 1977 // scalarize. Note that the addition of the scalar steps will not increase 1978 // the number of instructions in the loop in the common case prior to 1979 // InstCombine. We will be trading one vector extract for each scalar step. 
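// Illustrative sketch (not part of the original comments): for an integer IV
// with step S, VF = 4 and UF = 2, buildScalarSteps below emits
//   ScalarIV + {0,1,2,3} * S   for unroll part 0, and
//   ScalarIV + {4,5,6,7} * S   for unroll part 1,
// generating only lane 0 of each part when EntryVal is uniform after
// vectorization.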
1980 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1981 return; 1982 } 1983 1984 // All IV users are scalar instructions, so only emit a scalar IV, not a 1985 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 1986 // predicate used by the masked loads/stores. 1987 Value *ScalarIV = CreateScalarIV(Step); 1988 if (!Cost->isScalarEpilogueAllowed()) 1989 CreateSplatIV(ScalarIV, Step); 1990 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1991 } 1992 1993 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 1994 Instruction::BinaryOps BinOp) { 1995 // Create and check the types. 1996 auto *ValVTy = cast<VectorType>(Val->getType()); 1997 int VLen = ValVTy->getNumElements(); 1998 1999 Type *STy = Val->getType()->getScalarType(); 2000 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2001 "Induction Step must be an integer or FP"); 2002 assert(Step->getType() == STy && "Step has wrong type"); 2003 2004 SmallVector<Constant *, 8> Indices; 2005 2006 if (STy->isIntegerTy()) { 2007 // Create a vector of consecutive numbers from zero to VF. 2008 for (int i = 0; i < VLen; ++i) 2009 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2010 2011 // Add the consecutive indices to the vector value. 2012 Constant *Cv = ConstantVector::get(Indices); 2013 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2014 Step = Builder.CreateVectorSplat(VLen, Step); 2015 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2016 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2017 // which can be found from the original scalar operations. 2018 Step = Builder.CreateMul(Cv, Step); 2019 return Builder.CreateAdd(Val, Step, "induction"); 2020 } 2021 2022 // Floating point induction. 2023 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2024 "Binary Opcode should be specified for FP induction"); 2025 // Create a vector of consecutive numbers from zero to VF. 2026 for (int i = 0; i < VLen; ++i) 2027 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2028 2029 // Add the consecutive indices to the vector value. 2030 Constant *Cv = ConstantVector::get(Indices); 2031 2032 Step = Builder.CreateVectorSplat(VLen, Step); 2033 2034 // Floating point operations had to be 'fast' to enable the induction. 2035 FastMathFlags Flags; 2036 Flags.setFast(); 2037 2038 Value *MulOp = Builder.CreateFMul(Cv, Step); 2039 if (isa<Instruction>(MulOp)) 2040 // Have to check, MulOp may be a constant 2041 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 2042 2043 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2044 if (isa<Instruction>(BOp)) 2045 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2046 return BOp; 2047 } 2048 2049 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2050 Instruction *EntryVal, 2051 const InductionDescriptor &ID) { 2052 // We shouldn't have to build scalar steps if we aren't vectorizing. 2053 assert(VF > 1 && "VF should be greater than one"); 2054 2055 // Get the value type and ensure it and the step have the same integer type. 2056 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2057 assert(ScalarIVTy == Step->getType() && 2058 "Val and Step should have the same type"); 2059 2060 // We build scalar steps for both integer and floating-point induction 2061 // variables. Here, we determine the kind of arithmetic we will perform. 
2062 Instruction::BinaryOps AddOp; 2063 Instruction::BinaryOps MulOp; 2064 if (ScalarIVTy->isIntegerTy()) { 2065 AddOp = Instruction::Add; 2066 MulOp = Instruction::Mul; 2067 } else { 2068 AddOp = ID.getInductionOpcode(); 2069 MulOp = Instruction::FMul; 2070 } 2071 2072 // Determine the number of scalars we need to generate for each unroll 2073 // iteration. If EntryVal is uniform, we only need to generate the first 2074 // lane. Otherwise, we generate all VF values. 2075 unsigned Lanes = 2076 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 2077 : VF; 2078 // Compute the scalar steps and save the results in VectorLoopValueMap. 2079 for (unsigned Part = 0; Part < UF; ++Part) { 2080 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2081 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); 2082 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2083 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2084 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2085 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2086 } 2087 } 2088 } 2089 2090 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2091 assert(V != Induction && "The new induction variable should not be used."); 2092 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2093 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2094 2095 // If we have a stride that is replaced by one, do it here. Defer this for 2096 // the VPlan-native path until we start running Legal checks in that path. 2097 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2098 V = ConstantInt::get(V->getType(), 1); 2099 2100 // If we have a vector mapped to this value, return it. 2101 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2102 return VectorLoopValueMap.getVectorValue(V, Part); 2103 2104 // If the value has not been vectorized, check if it has been scalarized 2105 // instead. If it has been scalarized, and we actually need the value in 2106 // vector form, we will construct the vector values on demand. 2107 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2108 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2109 2110 // If we've scalarized a value, that value should be an instruction. 2111 auto *I = cast<Instruction>(V); 2112 2113 // If we aren't vectorizing, we can just copy the scalar map values over to 2114 // the vector map. 2115 if (VF == 1) { 2116 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2117 return ScalarValue; 2118 } 2119 2120 // Get the last scalar instruction we generated for V and Part. If the value 2121 // is known to be uniform after vectorization, this corresponds to lane zero 2122 // of the Part unroll iteration. Otherwise, the last instruction is the one 2123 // we created for the last vector lane of the Part unroll iteration. 2124 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; 2125 auto *LastInst = cast<Instruction>( 2126 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2127 2128 // Set the insert point after the last scalarized instruction. This ensures 2129 // the insertelement sequence will directly follow the scalar definitions. 2130 auto OldIP = Builder.saveIP(); 2131 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2132 Builder.SetInsertPoint(&*NewIP); 2133 2134 // However, if we are vectorizing, we need to construct the vector values. 
2135 // If the value is known to be uniform after vectorization, we can just 2136 // broadcast the scalar value corresponding to lane zero for each unroll 2137 // iteration. Otherwise, we construct the vector values using insertelement 2138 // instructions. Since the resulting vectors are stored in 2139 // VectorLoopValueMap, we will only generate the insertelements once. 2140 Value *VectorValue = nullptr; 2141 if (Cost->isUniformAfterVectorization(I, VF)) { 2142 VectorValue = getBroadcastInstrs(ScalarValue); 2143 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2144 } else { 2145 // Initialize packing with insertelements to start from undef. 2146 Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), VF)); 2147 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2148 for (unsigned Lane = 0; Lane < VF; ++Lane) 2149 packScalarIntoVectorValue(V, {Part, Lane}); 2150 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2151 } 2152 Builder.restoreIP(OldIP); 2153 return VectorValue; 2154 } 2155 2156 // If this scalar is unknown, assume that it is a constant or that it is 2157 // loop invariant. Broadcast V and save the value for future uses. 2158 Value *B = getBroadcastInstrs(V); 2159 VectorLoopValueMap.setVectorValue(V, Part, B); 2160 return B; 2161 } 2162 2163 Value * 2164 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2165 const VPIteration &Instance) { 2166 // If the value is not an instruction contained in the loop, it should 2167 // already be scalar. 2168 if (OrigLoop->isLoopInvariant(V)) 2169 return V; 2170 2171 assert(Instance.Lane > 0 2172 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2173 : true && "Uniform values only have lane zero"); 2174 2175 // If the value from the original loop has not been vectorized, it is 2176 // represented by UF x VF scalar values in the new loop. Return the requested 2177 // scalar value. 2178 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2179 return VectorLoopValueMap.getScalarValue(V, Instance); 2180 2181 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2182 // for the given unroll part. If this entry is not a vector type (i.e., the 2183 // vectorization factor is one), there is no need to generate an 2184 // extractelement instruction. 2185 auto *U = getOrCreateVectorValue(V, Instance.Part); 2186 if (!U->getType()->isVectorTy()) { 2187 assert(VF == 1 && "Value not scalarized has non-vector type"); 2188 return U; 2189 } 2190 2191 // Otherwise, the value from the original loop has been vectorized and is 2192 // represented by UF vector values. Extract and return the requested scalar 2193 // value from the appropriate vector lane. 
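// Illustrative sketch (not part of the original comments): with VF = 4,
// requesting {Part = 1, Lane = 2} of a vectorized i32 value amounts to
//   extractelement <4 x i32> %v.part1, i32 2
// where %v.part1 (a name used here only for illustration) is the vector
// produced for unroll part 1.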
2194 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2195 } 2196 2197 void InnerLoopVectorizer::packScalarIntoVectorValue( 2198 Value *V, const VPIteration &Instance) { 2199 assert(V != Induction && "The new induction variable should not be used."); 2200 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2201 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2202 2203 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2204 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2205 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2206 Builder.getInt32(Instance.Lane)); 2207 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2208 } 2209 2210 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2211 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2212 SmallVector<int, 8> ShuffleMask; 2213 for (unsigned i = 0; i < VF; ++i) 2214 ShuffleMask.push_back(VF - i - 1); 2215 2216 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2217 ShuffleMask, "reverse"); 2218 } 2219 2220 // Return whether we allow using masked interleave-groups (for dealing with 2221 // strided loads/stores that reside in predicated blocks, or for dealing 2222 // with gaps). 2223 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2224 // If an override option has been passed in for interleaved accesses, use it. 2225 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2226 return EnableMaskedInterleavedMemAccesses; 2227 2228 return TTI.enableMaskedInterleavedAccessVectorization(); 2229 } 2230 2231 // Try to vectorize the interleave group that \p Instr belongs to. 2232 // 2233 // E.g. Translate following interleaved load group (factor = 3): 2234 // for (i = 0; i < N; i+=3) { 2235 // R = Pic[i]; // Member of index 0 2236 // G = Pic[i+1]; // Member of index 1 2237 // B = Pic[i+2]; // Member of index 2 2238 // ... // do something to R, G, B 2239 // } 2240 // To: 2241 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2242 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2243 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2244 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2245 // 2246 // Or translate following interleaved store group (factor = 3): 2247 // for (i = 0; i < N; i+=3) { 2248 // ... do something to R, G, B 2249 // Pic[i] = R; // Member of index 0 2250 // Pic[i+1] = G; // Member of index 1 2251 // Pic[i+2] = B; // Member of index 2 2252 // } 2253 // To: 2254 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2255 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2256 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2257 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2258 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2259 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2260 const InterleaveGroup<Instruction> *Group, VPTransformState &State, 2261 VPValue *Addr, VPValue *BlockInMask) { 2262 Instruction *Instr = Group->getInsertPos(); 2263 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2264 2265 // Prepare for the vector type of the interleaved load/store. 2266 Type *ScalarTy = getMemInstValueType(Instr); 2267 unsigned InterleaveFactor = Group->getFactor(); 2268 auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF); 2269 2270 // Prepare for the new pointers. 
2271 SmallVector<Value *, 2> AddrParts; 2272 unsigned Index = Group->getIndex(Instr); 2273 2274 // TODO: extend the masked interleaved-group support to reversed access. 2275 assert((!BlockInMask || !Group->isReverse()) && 2276 "Reversed masked interleave-group not supported."); 2277 2278 // If the group is reverse, adjust the index to refer to the last vector lane 2279 // instead of the first. We adjust the index from the first vector lane, 2280 // rather than directly getting the pointer for lane VF - 1, because the 2281 // pointer operand of the interleaved access is supposed to be uniform. For 2282 // uniform instructions, we're only required to generate a value for the 2283 // first vector lane in each unroll iteration. 2284 if (Group->isReverse()) 2285 Index += (VF - 1) * Group->getFactor(); 2286 2287 for (unsigned Part = 0; Part < UF; Part++) { 2288 Value *AddrPart = State.get(Addr, {Part, 0}); 2289 setDebugLocFromInst(Builder, AddrPart); 2290 2291 // Notice current instruction could be any index. Need to adjust the address 2292 // to the member of index 0. 2293 // 2294 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2295 // b = A[i]; // Member of index 0 2296 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2297 // 2298 // E.g. A[i+1] = a; // Member of index 1 2299 // A[i] = b; // Member of index 0 2300 // A[i+2] = c; // Member of index 2 (Current instruction) 2301 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2302 2303 bool InBounds = false; 2304 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2305 InBounds = gep->isInBounds(); 2306 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2307 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2308 2309 // Cast to the vector pointer type. 2310 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2311 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2312 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2313 } 2314 2315 setDebugLocFromInst(Builder, Instr); 2316 Value *UndefVec = UndefValue::get(VecTy); 2317 2318 Value *MaskForGaps = nullptr; 2319 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2320 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); 2321 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2322 } 2323 2324 // Vectorize the interleaved load group. 2325 if (isa<LoadInst>(Instr)) { 2326 // For each unroll part, create a wide load for the group. 2327 SmallVector<Value *, 2> NewLoads; 2328 for (unsigned Part = 0; Part < UF; Part++) { 2329 Instruction *NewLoad; 2330 if (BlockInMask || MaskForGaps) { 2331 assert(useMaskedInterleavedAccesses(*TTI) && 2332 "masked interleaved groups are not allowed."); 2333 Value *GroupMask = MaskForGaps; 2334 if (BlockInMask) { 2335 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2336 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2337 Value *ShuffledMask = Builder.CreateShuffleVector( 2338 BlockInMaskPart, Undefs, 2339 createReplicatedMask(InterleaveFactor, VF), "interleaved.mask"); 2340 GroupMask = MaskForGaps 2341 ? 
Builder.CreateBinOp(Instruction::And, ShuffledMask, 2342 MaskForGaps) 2343 : ShuffledMask; 2344 } 2345 NewLoad = 2346 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2347 GroupMask, UndefVec, "wide.masked.vec"); 2348 } 2349 else 2350 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2351 Group->getAlign(), "wide.vec"); 2352 Group->addMetadata(NewLoad); 2353 NewLoads.push_back(NewLoad); 2354 } 2355 2356 // For each member in the group, shuffle out the appropriate data from the 2357 // wide loads. 2358 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2359 Instruction *Member = Group->getMember(I); 2360 2361 // Skip the gaps in the group. 2362 if (!Member) 2363 continue; 2364 2365 auto StrideMask = createStrideMask(I, InterleaveFactor, VF); 2366 for (unsigned Part = 0; Part < UF; Part++) { 2367 Value *StridedVec = Builder.CreateShuffleVector( 2368 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2369 2370 // If this member has different type, cast the result type. 2371 if (Member->getType() != ScalarTy) { 2372 VectorType *OtherVTy = FixedVectorType::get(Member->getType(), VF); 2373 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2374 } 2375 2376 if (Group->isReverse()) 2377 StridedVec = reverseVector(StridedVec); 2378 2379 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2380 } 2381 } 2382 return; 2383 } 2384 2385 // The sub vector type for current instruction. 2386 auto *SubVT = FixedVectorType::get(ScalarTy, VF); 2387 2388 // Vectorize the interleaved store group. 2389 for (unsigned Part = 0; Part < UF; Part++) { 2390 // Collect the stored vector from each member. 2391 SmallVector<Value *, 4> StoredVecs; 2392 for (unsigned i = 0; i < InterleaveFactor; i++) { 2393 // Interleaved store group doesn't allow a gap, so each index has a member 2394 Instruction *Member = Group->getMember(i); 2395 assert(Member && "Fail to get a member from an interleaved store group"); 2396 2397 Value *StoredVec = getOrCreateVectorValue( 2398 cast<StoreInst>(Member)->getValueOperand(), Part); 2399 if (Group->isReverse()) 2400 StoredVec = reverseVector(StoredVec); 2401 2402 // If this member has different type, cast it to a unified type. 2403 2404 if (StoredVec->getType() != SubVT) 2405 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2406 2407 StoredVecs.push_back(StoredVec); 2408 } 2409 2410 // Concatenate all vectors into a wide vector. 2411 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2412 2413 // Interleave the elements in the wide vector. 2414 Value *IVec = Builder.CreateShuffleVector( 2415 WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor), 2416 "interleaved.vec"); 2417 2418 Instruction *NewStoreInstr; 2419 if (BlockInMask) { 2420 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2421 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2422 Value *ShuffledMask = Builder.CreateShuffleVector( 2423 BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF), 2424 "interleaved.mask"); 2425 NewStoreInstr = Builder.CreateMaskedStore( 2426 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2427 } 2428 else 2429 NewStoreInstr = 2430 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2431 2432 Group->addMetadata(NewStoreInstr); 2433 } 2434 } 2435 2436 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2437 VPTransformState &State, 2438 VPValue *Addr, 2439 VPValue *StoredValue, 2440 VPValue *BlockInMask) { 2441 // Attempt to issue a wide load. 
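// Illustrative sketch (not part of the original comments): for VF = 4, a
// consecutive i32 load is widened into a single 'load <4 x i32>'; a
// reverse-consecutive access additionally reverses the loaded vector with a
// shufflevector; and a CM_GatherScatter decision instead emits an
// llvm.masked.gather / llvm.masked.scatter intrinsic.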
2442 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2443 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2444 2445 assert((LI || SI) && "Invalid Load/Store instruction"); 2446 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2447 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2448 2449 LoopVectorizationCostModel::InstWidening Decision = 2450 Cost->getWideningDecision(Instr, VF); 2451 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2452 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2453 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2454 "CM decision is not to widen the memory instruction"); 2455 2456 Type *ScalarDataTy = getMemInstValueType(Instr); 2457 auto *DataTy = FixedVectorType::get(ScalarDataTy, VF); 2458 const Align Alignment = getLoadStoreAlignment(Instr); 2459 2460 // Determine if the pointer operand of the access is either consecutive or 2461 // reverse consecutive. 2462 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2463 bool ConsecutiveStride = 2464 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2465 bool CreateGatherScatter = 2466 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2467 2468 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2469 // gather/scatter. Otherwise Decision should have been to Scalarize. 2470 assert((ConsecutiveStride || CreateGatherScatter) && 2471 "The instruction should be scalarized"); 2472 (void)ConsecutiveStride; 2473 2474 VectorParts BlockInMaskParts(UF); 2475 bool isMaskRequired = BlockInMask; 2476 if (isMaskRequired) 2477 for (unsigned Part = 0; Part < UF; ++Part) 2478 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2479 2480 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2481 // Calculate the pointer for the specific unroll-part. 2482 GetElementPtrInst *PartPtr = nullptr; 2483 2484 bool InBounds = false; 2485 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2486 InBounds = gep->isInBounds(); 2487 2488 if (Reverse) { 2489 // If the address is consecutive but reversed, then the 2490 // wide store needs to start at the last vector element. 2491 PartPtr = cast<GetElementPtrInst>( 2492 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); 2493 PartPtr->setIsInBounds(InBounds); 2494 PartPtr = cast<GetElementPtrInst>( 2495 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); 2496 PartPtr->setIsInBounds(InBounds); 2497 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2498 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2499 } else { 2500 PartPtr = cast<GetElementPtrInst>( 2501 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); 2502 PartPtr->setIsInBounds(InBounds); 2503 } 2504 2505 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2506 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2507 }; 2508 2509 // Handle Stores: 2510 if (SI) { 2511 setDebugLocFromInst(Builder, SI); 2512 2513 for (unsigned Part = 0; Part < UF; ++Part) { 2514 Instruction *NewSI = nullptr; 2515 Value *StoredVal = State.get(StoredValue, Part); 2516 if (CreateGatherScatter) { 2517 Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; 2518 Value *VectorGep = State.get(Addr, Part); 2519 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2520 MaskPart); 2521 } else { 2522 if (Reverse) { 2523 // If we store to reverse consecutive memory locations, then we need 2524 // to reverse the order of elements in the stored value. 2525 StoredVal = reverseVector(StoredVal); 2526 // We don't want to update the value in the map as it might be used in 2527 // another expression. So don't call resetVectorValue(StoredVal). 2528 } 2529 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2530 if (isMaskRequired) 2531 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2532 BlockInMaskParts[Part]); 2533 else 2534 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2535 } 2536 addMetadata(NewSI, SI); 2537 } 2538 return; 2539 } 2540 2541 // Handle loads. 2542 assert(LI && "Must have a load instruction"); 2543 setDebugLocFromInst(Builder, LI); 2544 for (unsigned Part = 0; Part < UF; ++Part) { 2545 Value *NewLI; 2546 if (CreateGatherScatter) { 2547 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2548 Value *VectorGep = State.get(Addr, Part); 2549 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2550 nullptr, "wide.masked.gather"); 2551 addMetadata(NewLI, LI); 2552 } else { 2553 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2554 if (isMaskRequired) 2555 NewLI = Builder.CreateMaskedLoad( 2556 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2557 "wide.masked.load"); 2558 else 2559 NewLI = 2560 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2561 2562 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2563 addMetadata(NewLI, LI); 2564 if (Reverse) 2565 NewLI = reverseVector(NewLI); 2566 } 2567 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2568 } 2569 } 2570 2571 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2572 const VPIteration &Instance, 2573 bool IfPredicateInstr, 2574 VPTransformState &State) { 2575 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2576 2577 setDebugLocFromInst(Builder, Instr); 2578 2579 // Does this instruction return a value ? 2580 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2581 2582 Instruction *Cloned = Instr->clone(); 2583 if (!IsVoidRetTy) 2584 Cloned->setName(Instr->getName() + ".cloned"); 2585 2586 // Replace the operands of the cloned instructions with their scalar 2587 // equivalents in the new loop. 2588 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2589 auto *NewOp = State.get(User.getOperand(op), Instance); 2590 Cloned->setOperand(op, NewOp); 2591 } 2592 addNewMetadata(Cloned, Instr); 2593 2594 // Place the cloned scalar in the new loop. 2595 Builder.Insert(Cloned); 2596 2597 // Add the cloned scalar to the scalar map entry. 2598 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2599 2600 // If we just cloned a new assumption, add it the assumption cache. 2601 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2602 if (II->getIntrinsicID() == Intrinsic::assume) 2603 AC->registerAssumption(II); 2604 2605 // End if-block. 
2606 if (IfPredicateInstr) 2607 PredicatedInstructions.push_back(Cloned); 2608 } 2609 2610 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2611 Value *End, Value *Step, 2612 Instruction *DL) { 2613 BasicBlock *Header = L->getHeader(); 2614 BasicBlock *Latch = L->getLoopLatch(); 2615 // As we're just creating this loop, it's possible no latch exists 2616 // yet. If so, use the header as this will be a single block loop. 2617 if (!Latch) 2618 Latch = Header; 2619 2620 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2621 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2622 setDebugLocFromInst(Builder, OldInst); 2623 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2624 2625 Builder.SetInsertPoint(Latch->getTerminator()); 2626 setDebugLocFromInst(Builder, OldInst); 2627 2628 // Create i+1 and fill the PHINode. 2629 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2630 Induction->addIncoming(Start, L->getLoopPreheader()); 2631 Induction->addIncoming(Next, Latch); 2632 // Create the compare. 2633 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2634 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2635 2636 // Now we have two terminators. Remove the old one from the block. 2637 Latch->getTerminator()->eraseFromParent(); 2638 2639 return Induction; 2640 } 2641 2642 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2643 if (TripCount) 2644 return TripCount; 2645 2646 assert(L && "Create Trip Count for null loop."); 2647 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2648 // Find the loop boundaries. 2649 ScalarEvolution *SE = PSE.getSE(); 2650 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2651 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2652 "Invalid loop count"); 2653 2654 Type *IdxTy = Legal->getWidestInductionType(); 2655 assert(IdxTy && "No type for induction"); 2656 2657 // The exit count might have the type of i64 while the phi is i32. This can 2658 // happen if we have an induction variable that is sign extended before the 2659 // compare. The only way that we get a backedge taken count is that the 2660 // induction variable was signed and as such will not overflow. In such a case 2661 // truncation is legal. 2662 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2663 IdxTy->getPrimitiveSizeInBits()) 2664 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2665 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2666 2667 // Get the total trip count from the count by adding 1. 2668 const SCEV *ExitCount = SE->getAddExpr( 2669 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2670 2671 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2672 2673 // Expand the trip count and place the new instructions in the preheader. 2674 // Notice that the pre-header does not change, only the loop body. 2675 SCEVExpander Exp(*SE, DL, "induction"); 2676 2677 // Count holds the overall loop count (N). 
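// Illustrative example (not part of the original comments): for a loop such
// as 'for (i = 0; i < n; ++i)' the backedge-taken count is n - 1, so the trip
// count expanded below is (n - 1) + 1, i.e. n.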
2678 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2679 L->getLoopPreheader()->getTerminator()); 2680 2681 if (TripCount->getType()->isPointerTy()) 2682 TripCount = 2683 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2684 L->getLoopPreheader()->getTerminator()); 2685 2686 return TripCount; 2687 } 2688 2689 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2690 if (VectorTripCount) 2691 return VectorTripCount; 2692 2693 Value *TC = getOrCreateTripCount(L); 2694 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2695 2696 Type *Ty = TC->getType(); 2697 Constant *Step = ConstantInt::get(Ty, VF * UF); 2698 2699 // If the tail is to be folded by masking, round the number of iterations N 2700 // up to a multiple of Step instead of rounding down. This is done by first 2701 // adding Step-1 and then rounding down. Note that it's ok if this addition 2702 // overflows: the vector induction variable will eventually wrap to zero given 2703 // that it starts at zero and its Step is a power of two; the loop will then 2704 // exit, with the last early-exit vector comparison also producing all-true. 2705 if (Cost->foldTailByMasking()) { 2706 assert(isPowerOf2_32(VF * UF) && 2707 "VF*UF must be a power of 2 when folding tail by masking"); 2708 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); 2709 } 2710 2711 // Now we need to generate the expression for the part of the loop that the 2712 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2713 // iterations are not required for correctness, or N - Step, otherwise. Step 2714 // is equal to the vectorization factor (number of SIMD elements) times the 2715 // unroll factor (number of SIMD instructions). 2716 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2717 2718 // If there is a non-reversed interleaved group that may speculatively access 2719 // memory out-of-bounds, we need to ensure that there will be at least one 2720 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2721 // the trip count, we set the remainder to be equal to the step. If the step 2722 // does not evenly divide the trip count, no adjustment is necessary since 2723 // there will already be scalar iterations. Note that the minimum iterations 2724 // check ensures that N >= Step. 2725 if (VF > 1 && Cost->requiresScalarEpilogue()) { 2726 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2727 R = Builder.CreateSelect(IsZero, Step, R); 2728 } 2729 2730 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2731 2732 return VectorTripCount; 2733 } 2734 2735 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2736 const DataLayout &DL) { 2737 // Verify that V is a vector type with same number of elements as DstVTy. 2738 unsigned VF = DstVTy->getNumElements(); 2739 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2740 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2741 Type *SrcElemTy = SrcVecTy->getElementType(); 2742 Type *DstElemTy = DstVTy->getElementType(); 2743 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2744 "Vector elements must have same size"); 2745 2746 // Do a direct cast if element types are castable. 2747 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2748 return Builder.CreateBitOrPointerCast(V, DstVTy); 2749 } 2750 // V cannot be directly casted to desired vector type. 
2751 // May happen when V is a floating point vector but DstVTy is a vector of 2752 // pointers or vice-versa. Handle this using a two-step bitcast using an 2753 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2754 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2755 "Only one type should be a pointer type"); 2756 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2757 "Only one type should be a floating point type"); 2758 Type *IntTy = 2759 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2760 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2761 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2762 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2763 } 2764 2765 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2766 BasicBlock *Bypass) { 2767 Value *Count = getOrCreateTripCount(L); 2768 // Reuse existing vector loop preheader for TC checks. 2769 // Note that new preheader block is generated for vector loop. 2770 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2771 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2772 2773 // Generate code to check if the loop's trip count is less than VF * UF, or 2774 // equal to it in case a scalar epilogue is required; this implies that the 2775 // vector trip count is zero. This check also covers the case where adding one 2776 // to the backedge-taken count overflowed leading to an incorrect trip count 2777 // of zero. In this case we will also jump to the scalar loop. 2778 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 2779 : ICmpInst::ICMP_ULT; 2780 2781 // If tail is to be folded, vector loop takes care of all iterations. 2782 Value *CheckMinIters = Builder.getFalse(); 2783 if (!Cost->foldTailByMasking()) 2784 CheckMinIters = Builder.CreateICmp( 2785 P, Count, ConstantInt::get(Count->getType(), VF * UF), 2786 "min.iters.check"); 2787 2788 // Create new preheader for vector loop. 2789 LoopVectorPreHeader = 2790 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2791 "vector.ph"); 2792 2793 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2794 DT->getNode(Bypass)->getIDom()) && 2795 "TC check is expected to dominate Bypass"); 2796 2797 // Update dominator for Bypass & LoopExit. 2798 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2799 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2800 2801 ReplaceInstWithInst( 2802 TCCheckBlock->getTerminator(), 2803 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2804 LoopBypassBlocks.push_back(TCCheckBlock); 2805 } 2806 2807 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2808 // Reuse existing vector loop preheader for SCEV checks. 2809 // Note that new preheader block is generated for vector loop. 2810 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 2811 2812 // Generate the code to check that the SCEV assumptions that we made. 2813 // We want the new basic block to start at the first instruction in a 2814 // sequence of instructions that form a check. 
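// As an illustration (the value names and the concrete predicate are hypothetical), a single
// "stride == 1" assumption would expand to roughly:
//   %stride.check = icmp ne i64 %stride, 1
//   br i1 %stride.check, label %scalar.ph, label %vector.ph
// i.e. the branch created below takes the bypass edge to the scalar loop whenever any of the
// SCEV assumptions fails at runtime.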
2815 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2816 "scev.check"); 2817 Value *SCEVCheck = Exp.expandCodeForPredicate( 2818 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 2819 2820 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2821 if (C->isZero()) 2822 return; 2823 2824 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 2825 (OptForSizeBasedOnProfile && 2826 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 2827 "Cannot SCEV check stride or overflow when optimizing for size"); 2828 2829 SCEVCheckBlock->setName("vector.scevcheck"); 2830 // Create new preheader for vector loop. 2831 LoopVectorPreHeader = 2832 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 2833 nullptr, "vector.ph"); 2834 2835 // Update dominator only if this is first RT check. 2836 if (LoopBypassBlocks.empty()) { 2837 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2838 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2839 } 2840 2841 ReplaceInstWithInst( 2842 SCEVCheckBlock->getTerminator(), 2843 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 2844 LoopBypassBlocks.push_back(SCEVCheckBlock); 2845 AddedSafetyChecks = true; 2846 } 2847 2848 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2849 // VPlan-native path does not do any analysis for runtime checks currently. 2850 if (EnableVPlanNativePath) 2851 return; 2852 2853 // Reuse existing vector loop preheader for runtime memory checks. 2854 // Note that new preheader block is generated for vector loop. 2855 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 2856 2857 // Generate the code that checks in runtime if arrays overlap. We put the 2858 // checks into a separate block to make the more common case of few elements 2859 // faster. 2860 auto *LAI = Legal->getLAI(); 2861 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 2862 if (!RtPtrChecking.Need) 2863 return; 2864 Instruction *FirstCheckInst; 2865 Instruction *MemRuntimeCheck; 2866 std::tie(FirstCheckInst, MemRuntimeCheck) = 2867 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 2868 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 2869 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " 2870 "claimed checks are required"); 2871 2872 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 2873 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2874 "Cannot emit memory checks when optimizing for size, unless forced " 2875 "to vectorize."); 2876 ORE->emit([&]() { 2877 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2878 L->getStartLoc(), L->getHeader()) 2879 << "Code-size may be reduced by not forcing " 2880 "vectorization, or by source-code modifications " 2881 "eliminating the need for runtime checks " 2882 "(e.g., adding 'restrict')."; 2883 }); 2884 } 2885 2886 MemCheckBlock->setName("vector.memcheck"); 2887 // Create new preheader for vector loop. 2888 LoopVectorPreHeader = 2889 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2890 "vector.ph"); 2891 2892 // Update dominator only if this is first RT check. 
2893 if (LoopBypassBlocks.empty()) { 2894 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2895 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2896 } 2897 2898 ReplaceInstWithInst( 2899 MemCheckBlock->getTerminator(), 2900 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); 2901 LoopBypassBlocks.push_back(MemCheckBlock); 2902 AddedSafetyChecks = true; 2903 2904 // We currently don't use LoopVersioning for the actual loop cloning but we 2905 // still use it to add the noalias metadata. 2906 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2907 PSE.getSE()); 2908 LVer->prepareNoAliasMetadata(); 2909 } 2910 2911 Value *InnerLoopVectorizer::emitTransformedIndex( 2912 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2913 const InductionDescriptor &ID) const { 2914 2915 SCEVExpander Exp(*SE, DL, "induction"); 2916 auto Step = ID.getStep(); 2917 auto StartValue = ID.getStartValue(); 2918 assert(Index->getType() == Step->getType() && 2919 "Index type does not match StepValue type"); 2920 2921 // Note: the IR at this point is broken. We cannot use SE to create any new 2922 // SCEV and then expand it, hoping that SCEV's simplification will give us 2923 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2924 // lead to various SCEV crashes. So all we can do is to use builder and rely 2925 // on InstCombine for future simplifications. Here we handle some trivial 2926 // cases only. 2927 auto CreateAdd = [&B](Value *X, Value *Y) { 2928 assert(X->getType() == Y->getType() && "Types don't match!"); 2929 if (auto *CX = dyn_cast<ConstantInt>(X)) 2930 if (CX->isZero()) 2931 return Y; 2932 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2933 if (CY->isZero()) 2934 return X; 2935 return B.CreateAdd(X, Y); 2936 }; 2937 2938 auto CreateMul = [&B](Value *X, Value *Y) { 2939 assert(X->getType() == Y->getType() && "Types don't match!"); 2940 if (auto *CX = dyn_cast<ConstantInt>(X)) 2941 if (CX->isOne()) 2942 return Y; 2943 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2944 if (CY->isOne()) 2945 return X; 2946 return B.CreateMul(X, Y); 2947 }; 2948 2949 // Get a suitable insert point for SCEV expansion. For blocks in the vector 2950 // loop, choose the end of the vector loop header (=LoopVectorBody), because 2951 // the DomTree is not kept up-to-date for additional blocks generated in the 2952 // vector loop. By using the header as insertion point, we guarantee that the 2953 // expanded instructions dominate all their uses. 
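// For example (illustrative values only): an IK_IntInduction with start value 7 and step 3 maps
// Index to 7 + 3 * Index in the switch below, so Index == 4 becomes 19. The CreateAdd/CreateMul
// helpers above merely fold the trivial 0 and 1 constant cases instead of asking SCEV to
// simplify, for the reasons given in the note above.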
2954 auto GetInsertPoint = [this, &B]() { 2955 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 2956 if (InsertBB != LoopVectorBody && 2957 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 2958 return LoopVectorBody->getTerminator(); 2959 return &*B.GetInsertPoint(); 2960 }; 2961 switch (ID.getKind()) { 2962 case InductionDescriptor::IK_IntInduction: { 2963 assert(Index->getType() == StartValue->getType() && 2964 "Index type does not match StartValue type"); 2965 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 2966 return B.CreateSub(StartValue, Index); 2967 auto *Offset = CreateMul( 2968 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 2969 return CreateAdd(StartValue, Offset); 2970 } 2971 case InductionDescriptor::IK_PtrInduction: { 2972 assert(isa<SCEVConstant>(Step) && 2973 "Expected constant step for pointer induction"); 2974 return B.CreateGEP( 2975 StartValue->getType()->getPointerElementType(), StartValue, 2976 CreateMul(Index, 2977 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 2978 } 2979 case InductionDescriptor::IK_FpInduction: { 2980 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2981 auto InductionBinOp = ID.getInductionBinOp(); 2982 assert(InductionBinOp && 2983 (InductionBinOp->getOpcode() == Instruction::FAdd || 2984 InductionBinOp->getOpcode() == Instruction::FSub) && 2985 "Original bin op should be defined for FP induction"); 2986 2987 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 2988 2989 // Floating point operations had to be 'fast' to enable the induction. 2990 FastMathFlags Flags; 2991 Flags.setFast(); 2992 2993 Value *MulExp = B.CreateFMul(StepValue, Index); 2994 if (isa<Instruction>(MulExp)) 2995 // We have to check because MulExp may be a constant. 2996 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 2997 2998 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2999 "induction"); 3000 if (isa<Instruction>(BOp)) 3001 cast<Instruction>(BOp)->setFastMathFlags(Flags); 3002 3003 return BOp; 3004 } 3005 case InductionDescriptor::IK_NoInduction: 3006 return nullptr; 3007 } 3008 llvm_unreachable("invalid enum"); 3009 } 3010 3011 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3012 LoopScalarBody = OrigLoop->getHeader(); 3013 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3014 LoopExitBlock = OrigLoop->getExitBlock(); 3015 assert(LoopExitBlock && "Must have an exit block"); 3016 assert(LoopVectorPreHeader && "Invalid loop structure"); 3017 3018 LoopMiddleBlock = 3019 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3020 LI, nullptr, Twine(Prefix) + "middle.block"); 3021 LoopScalarPreHeader = 3022 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3023 nullptr, Twine(Prefix) + "scalar.ph"); 3024 // We intentionally don't let SplitBlock update LoopInfo since 3025 // LoopVectorBody should belong to a different loop than LoopVectorPreHeader. 3026 // LoopVectorBody is explicitly added to the correct place a few lines later. 3027 LoopVectorBody = 3028 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3029 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3030 3031 // Update dominator for loop exit. 3032 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3033 3034 // Create and register the new vector loop.
3035 Loop *Lp = LI->AllocateLoop(); 3036 Loop *ParentLoop = OrigLoop->getParentLoop(); 3037 3038 // Insert the new loop into the loop nest and register the new basic blocks 3039 // before calling any utilities such as SCEV that require valid LoopInfo. 3040 if (ParentLoop) { 3041 ParentLoop->addChildLoop(Lp); 3042 } else { 3043 LI->addTopLevelLoop(Lp); 3044 } 3045 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3046 return Lp; 3047 } 3048 3049 void InnerLoopVectorizer::createInductionResumeValues(Loop *L, 3050 Value *VectorTripCount) { 3051 assert(VectorTripCount && L && "Expected valid arguments"); 3052 // We are going to resume the execution of the scalar loop. 3053 // Go over all of the induction variables that we found and fix the 3054 // PHIs that are left in the scalar version of the loop. 3055 // The starting values of PHI nodes depend on the counter of the last 3056 // iteration in the vectorized loop. 3057 // If we come from a bypass edge then we need to start from the original 3058 // start value. 3059 for (auto &InductionEntry : Legal->getInductionVars()) { 3060 PHINode *OrigPhi = InductionEntry.first; 3061 InductionDescriptor II = InductionEntry.second; 3062 3063 // Create phi nodes to merge from the backedge-taken check block. 3064 PHINode *BCResumeVal = 3065 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3066 LoopScalarPreHeader->getTerminator()); 3067 // Copy original phi DL over to the new one. 3068 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3069 Value *&EndValue = IVEndValues[OrigPhi]; 3070 if (OrigPhi == OldInduction) { 3071 // We know what the end value is. 3072 EndValue = VectorTripCount; 3073 } else { 3074 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3075 Type *StepType = II.getStep()->getType(); 3076 Instruction::CastOps CastOp = 3077 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3078 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3079 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3080 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3081 EndValue->setName("ind.end"); 3082 } 3083 3084 // The new PHI merges the original incoming value, in case of a bypass, 3085 // or the value at the end of the vectorized loop. 3086 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3087 3088 // Fix the scalar body counter (PHI node). 3089 // The old induction's phi node in the scalar body needs the truncated 3090 // value. 3091 for (BasicBlock *BB : LoopBypassBlocks) 3092 BCResumeVal->addIncoming(II.getStartValue(), BB); 3093 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3094 } 3095 } 3096 3097 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3098 MDNode *OrigLoopID) { 3099 assert(L && "Expected valid loop."); 3100 3101 // The trip counts should be cached by now. 3102 Value *Count = getOrCreateTripCount(L); 3103 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3104 3105 // We need the OrigLoop (scalar loop part) latch terminator to help 3106 // produce correct debug info for the middle block BB instructions. 3107 // The legality check stage guarantees that the loop will have a single 3108 // latch. 
3109 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3110 "Scalar loop latch terminator isn't a branch"); 3111 BranchInst *ScalarLatchBr = 3112 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3113 3114 // Add a check in the middle block to see if we have completed 3115 // all of the iterations in the first vector loop. 3116 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3117 // If tail is to be folded, we know we don't need to run the remainder. 3118 Value *CmpN = Builder.getTrue(); 3119 if (!Cost->foldTailByMasking()) { 3120 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3121 VectorTripCount, "cmp.n", 3122 LoopMiddleBlock->getTerminator()); 3123 3124 // Here we use the same DebugLoc as the scalar loop latch branch instead 3125 // of the corresponding compare because they may have ended up with 3126 // different line numbers and we want to avoid awkward line stepping while 3127 // debugging. Eg. if the compare has got a line number inside the loop. 3128 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3129 } 3130 3131 BranchInst *BrInst = 3132 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3133 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3134 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3135 3136 // Get ready to start creating new instructions into the vectorized body. 3137 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3138 "Inconsistent vector loop preheader"); 3139 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3140 3141 Optional<MDNode *> VectorizedLoopID = 3142 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3143 LLVMLoopVectorizeFollowupVectorized}); 3144 if (VectorizedLoopID.hasValue()) { 3145 L->setLoopID(VectorizedLoopID.getValue()); 3146 3147 // Do not setAlreadyVectorized if loop attributes have been defined 3148 // explicitly. 3149 return LoopVectorPreHeader; 3150 } 3151 3152 // Keep all loop hints from the original loop on the vector loop (we'll 3153 // replace the vectorizer-specific hints below). 3154 if (MDNode *LID = OrigLoop->getLoopID()) 3155 L->setLoopID(LID); 3156 3157 LoopVectorizeHints Hints(L, true, *ORE); 3158 Hints.setAlreadyVectorized(); 3159 3160 #ifdef EXPENSIVE_CHECKS 3161 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3162 LI->verify(*DT); 3163 #endif 3164 3165 return LoopVectorPreHeader; 3166 } 3167 3168 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3169 /* 3170 In this function we generate a new loop. The new loop will contain 3171 the vectorized instructions while the old loop will continue to run the 3172 scalar remainder. 3173 3174 [ ] <-- loop iteration number check. 3175 / | 3176 / v 3177 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3178 | / | 3179 | / v 3180 || [ ] <-- vector pre header. 3181 |/ | 3182 | v 3183 | [ ] \ 3184 | [ ]_| <-- vector loop. 3185 | | 3186 | v 3187 | -[ ] <--- middle-block. 3188 | / | 3189 | / v 3190 -|- >[ ] <--- new preheader. 3191 | | 3192 | v 3193 | [ ] \ 3194 | [ ]_| <-- old scalar loop to handle remainder. 3195 \ | 3196 \ v 3197 >[ ] <-- exit block. 3198 ... 3199 */ 3200 3201 // Get the metadata of the original loop before it gets modified. 3202 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3203 3204 // Create an empty vector loop, and prepare basic blocks for the runtime 3205 // checks. 3206 Loop *Lp = createVectorLoopSkeleton(""); 3207 3208 // Now, compare the new count to zero. 
If it is zero skip the vector loop and 3209 // jump to the scalar loop. This check also covers the case where the 3210 // backedge-taken count is uint##_max: adding one to it will overflow leading 3211 // to an incorrect trip count of zero. In this (rare) case we will also jump 3212 // to the scalar loop. 3213 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3214 3215 // Generate the code to check any assumptions that we've made for SCEV 3216 // expressions. 3217 emitSCEVChecks(Lp, LoopScalarPreHeader); 3218 3219 // Generate the code that checks in runtime if arrays overlap. We put the 3220 // checks into a separate block to make the more common case of few elements 3221 // faster. 3222 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3223 3224 // Some loops have a single integer induction variable, while other loops 3225 // don't. One example is c++ iterators that often have multiple pointer 3226 // induction variables. In the code below we also support a case where we 3227 // don't have a single induction variable. 3228 // 3229 // We try to obtain an induction variable from the original loop as hard 3230 // as possible. However if we don't find one that: 3231 // - is an integer 3232 // - counts from zero, stepping by one 3233 // - is the size of the widest induction variable type 3234 // then we create a new one. 3235 OldInduction = Legal->getPrimaryInduction(); 3236 Type *IdxTy = Legal->getWidestInductionType(); 3237 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3238 // The loop step is equal to the vectorization factor (num of SIMD elements) 3239 // times the unroll factor (num of SIMD instructions). 3240 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 3241 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3242 Induction = 3243 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3244 getDebugLocFromInstOrOperands(OldInduction)); 3245 3246 // Emit phis for the new starting index of the scalar loop. 3247 createInductionResumeValues(Lp, CountRoundDown); 3248 3249 return completeLoopSkeleton(Lp, OrigLoopID); 3250 } 3251 3252 // Fix up external users of the induction variable. At this point, we are 3253 // in LCSSA form, with all external PHIs that use the IV having one input value, 3254 // coming from the remainder loop. We need those PHIs to also have a correct 3255 // value for the IV when arriving directly from the middle block. 3256 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3257 const InductionDescriptor &II, 3258 Value *CountRoundDown, Value *EndValue, 3259 BasicBlock *MiddleBlock) { 3260 // There are two kinds of external IV usages - those that use the value 3261 // computed in the last iteration (the PHI) and those that use the penultimate 3262 // value (the value that feeds into the phi from the loop latch). 3263 // We allow both, but they, obviously, have different values. 3264 3265 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3266 3267 DenseMap<Value *, Value *> MissingVals; 3268 3269 // An external user of the last iteration's value should see the value that 3270 // the remainder loop uses to initialize its own IV. 3271 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3272 for (User *U : PostInc->users()) { 3273 Instruction *UI = cast<Instruction>(U); 3274 if (!OrigLoop->contains(UI)) { 3275 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3276 MissingVals[UI] = EndValue; 3277 } 3278 } 3279 3280 // An external user of the penultimate value need to see EndValue - Step. 
3281 // The simplest way to get this is to recompute it from the constituent SCEVs, 3282 // that is Start + (Step * (CRD - 1)). 3283 for (User *U : OrigPhi->users()) { 3284 auto *UI = cast<Instruction>(U); 3285 if (!OrigLoop->contains(UI)) { 3286 const DataLayout &DL = 3287 OrigLoop->getHeader()->getModule()->getDataLayout(); 3288 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3289 3290 IRBuilder<> B(MiddleBlock->getTerminator()); 3291 Value *CountMinusOne = B.CreateSub( 3292 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3293 Value *CMO = 3294 !II.getStep()->getType()->isIntegerTy() 3295 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3296 II.getStep()->getType()) 3297 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3298 CMO->setName("cast.cmo"); 3299 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3300 Escape->setName("ind.escape"); 3301 MissingVals[UI] = Escape; 3302 } 3303 } 3304 3305 for (auto &I : MissingVals) { 3306 PHINode *PHI = cast<PHINode>(I.first); 3307 // One corner case we have to handle is two IVs "chasing" each-other, 3308 // that is %IV2 = phi [...], [ %IV1, %latch ] 3309 // In this case, if IV1 has an external use, we need to avoid adding both 3310 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3311 // don't already have an incoming value for the middle block. 3312 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3313 PHI->addIncoming(I.second, MiddleBlock); 3314 } 3315 } 3316 3317 namespace { 3318 3319 struct CSEDenseMapInfo { 3320 static bool canHandle(const Instruction *I) { 3321 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3322 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3323 } 3324 3325 static inline Instruction *getEmptyKey() { 3326 return DenseMapInfo<Instruction *>::getEmptyKey(); 3327 } 3328 3329 static inline Instruction *getTombstoneKey() { 3330 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3331 } 3332 3333 static unsigned getHashValue(const Instruction *I) { 3334 assert(canHandle(I) && "Unknown instruction!"); 3335 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3336 I->value_op_end())); 3337 } 3338 3339 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3340 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3341 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3342 return LHS == RHS; 3343 return LHS->isIdenticalTo(RHS); 3344 } 3345 }; 3346 3347 } // end anonymous namespace 3348 3349 ///Perform cse of induction variable instructions. 3350 static void cse(BasicBlock *BB) { 3351 // Perform simple cse. 3352 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3353 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3354 Instruction *In = &*I++; 3355 3356 if (!CSEDenseMapInfo::canHandle(In)) 3357 continue; 3358 3359 // Check if we can replace this instruction with any of the 3360 // visited instructions. 
3361 if (Instruction *V = CSEMap.lookup(In)) { 3362 In->replaceAllUsesWith(V); 3363 In->eraseFromParent(); 3364 continue; 3365 } 3366 3367 CSEMap[In] = In; 3368 } 3369 } 3370 3371 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3372 unsigned VF, 3373 bool &NeedToScalarize) { 3374 Function *F = CI->getCalledFunction(); 3375 Type *ScalarRetTy = CI->getType(); 3376 SmallVector<Type *, 4> Tys, ScalarTys; 3377 for (auto &ArgOp : CI->arg_operands()) 3378 ScalarTys.push_back(ArgOp->getType()); 3379 3380 // Estimate cost of scalarized vector call. The source operands are assumed 3381 // to be vectors, so we need to extract individual elements from there, 3382 // execute VF scalar calls, and then gather the result into the vector return 3383 // value. 3384 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, 3385 TTI::TCK_RecipThroughput); 3386 if (VF == 1) 3387 return ScalarCallCost; 3388 3389 // Compute corresponding vector type for return value and arguments. 3390 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3391 for (Type *ScalarTy : ScalarTys) 3392 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3393 3394 // Compute costs of unpacking argument values for the scalar calls and 3395 // packing the return values to a vector. 3396 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3397 3398 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3399 3400 // If we can't emit a vector call for this function, then the currently found 3401 // cost is the cost we need to return. 3402 NeedToScalarize = true; 3403 VFShape Shape = 3404 VFShape::get(*CI, ElementCount::getFixed(VF), false /*HasGlobalPred*/); 3405 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3406 3407 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3408 return Cost; 3409 3410 // If the corresponding vector cost is cheaper, return its cost. 3411 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, 3412 TTI::TCK_RecipThroughput); 3413 if (VectorCallCost < Cost) { 3414 NeedToScalarize = false; 3415 return VectorCallCost; 3416 } 3417 return Cost; 3418 } 3419 3420 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3421 unsigned VF) { 3422 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3423 assert(ID && "Expected intrinsic call!"); 3424 3425 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3426 return TTI.getIntrinsicInstrCost(CostAttrs, 3427 TargetTransformInfo::TCK_RecipThroughput); 3428 } 3429 3430 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3431 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3432 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3433 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3434 } 3435 3436 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3437 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3438 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3439 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3440 } 3441 3442 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3443 // For every instruction `I` in MinBWs, truncate the operands, create a 3444 // truncated version of `I` and reextend its result. InstCombine runs 3445 // later and will remove any ext/trunc pairs. 3446 SmallPtrSet<Value *, 4> Erased; 3447 for (const auto &KV : Cost->getMinimalBitwidths()) { 3448 // If the value wasn't vectorized, we must maintain the original scalar 3449 // type. 
The absence of the value from VectorLoopValueMap indicates that it 3450 // wasn't vectorized. 3451 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3452 continue; 3453 for (unsigned Part = 0; Part < UF; ++Part) { 3454 Value *I = getOrCreateVectorValue(KV.first, Part); 3455 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3456 continue; 3457 Type *OriginalTy = I->getType(); 3458 Type *ScalarTruncatedTy = 3459 IntegerType::get(OriginalTy->getContext(), KV.second); 3460 auto *TruncatedTy = FixedVectorType::get( 3461 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements()); 3462 if (TruncatedTy == OriginalTy) 3463 continue; 3464 3465 IRBuilder<> B(cast<Instruction>(I)); 3466 auto ShrinkOperand = [&](Value *V) -> Value * { 3467 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3468 if (ZI->getSrcTy() == TruncatedTy) 3469 return ZI->getOperand(0); 3470 return B.CreateZExtOrTrunc(V, TruncatedTy); 3471 }; 3472 3473 // The actual instruction modification depends on the instruction type, 3474 // unfortunately. 3475 Value *NewI = nullptr; 3476 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3477 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3478 ShrinkOperand(BO->getOperand(1))); 3479 3480 // Any wrapping introduced by shrinking this operation shouldn't be 3481 // considered undefined behavior. So, we can't unconditionally copy 3482 // arithmetic wrapping flags to NewI. 3483 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3484 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3485 NewI = 3486 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3487 ShrinkOperand(CI->getOperand(1))); 3488 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3489 NewI = B.CreateSelect(SI->getCondition(), 3490 ShrinkOperand(SI->getTrueValue()), 3491 ShrinkOperand(SI->getFalseValue())); 3492 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3493 switch (CI->getOpcode()) { 3494 default: 3495 llvm_unreachable("Unhandled cast!"); 3496 case Instruction::Trunc: 3497 NewI = ShrinkOperand(CI->getOperand(0)); 3498 break; 3499 case Instruction::SExt: 3500 NewI = B.CreateSExtOrTrunc( 3501 CI->getOperand(0), 3502 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3503 break; 3504 case Instruction::ZExt: 3505 NewI = B.CreateZExtOrTrunc( 3506 CI->getOperand(0), 3507 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3508 break; 3509 } 3510 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3511 auto Elements0 = 3512 cast<VectorType>(SI->getOperand(0)->getType())->getNumElements(); 3513 auto *O0 = B.CreateZExtOrTrunc( 3514 SI->getOperand(0), 3515 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3516 auto Elements1 = 3517 cast<VectorType>(SI->getOperand(1)->getType())->getNumElements(); 3518 auto *O1 = B.CreateZExtOrTrunc( 3519 SI->getOperand(1), 3520 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3521 3522 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3523 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3524 // Don't do anything with the operands, just extend the result. 
3525 continue; 3526 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3527 auto Elements = 3528 cast<VectorType>(IE->getOperand(0)->getType())->getNumElements(); 3529 auto *O0 = B.CreateZExtOrTrunc( 3530 IE->getOperand(0), 3531 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3532 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3533 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3534 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3535 auto Elements = 3536 cast<VectorType>(EE->getOperand(0)->getType())->getNumElements(); 3537 auto *O0 = B.CreateZExtOrTrunc( 3538 EE->getOperand(0), 3539 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3540 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3541 } else { 3542 // If we don't know what to do, be conservative and don't do anything. 3543 continue; 3544 } 3545 3546 // Lastly, extend the result. 3547 NewI->takeName(cast<Instruction>(I)); 3548 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3549 I->replaceAllUsesWith(Res); 3550 cast<Instruction>(I)->eraseFromParent(); 3551 Erased.insert(I); 3552 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3553 } 3554 } 3555 3556 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3557 for (const auto &KV : Cost->getMinimalBitwidths()) { 3558 // If the value wasn't vectorized, we must maintain the original scalar 3559 // type. The absence of the value from VectorLoopValueMap indicates that it 3560 // wasn't vectorized. 3561 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3562 continue; 3563 for (unsigned Part = 0; Part < UF; ++Part) { 3564 Value *I = getOrCreateVectorValue(KV.first, Part); 3565 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3566 if (Inst && Inst->use_empty()) { 3567 Value *NewI = Inst->getOperand(0); 3568 Inst->eraseFromParent(); 3569 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3570 } 3571 } 3572 } 3573 } 3574 3575 void InnerLoopVectorizer::fixVectorizedLoop() { 3576 // Insert truncates and extends for any truncated instructions as hints to 3577 // InstCombine. 3578 if (VF > 1) 3579 truncateToMinimalBitwidths(); 3580 3581 // Fix widened non-induction PHIs by setting up the PHI operands. 3582 if (OrigPHIsToFix.size()) { 3583 assert(EnableVPlanNativePath && 3584 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3585 fixNonInductionPHIs(); 3586 } 3587 3588 // At this point every instruction in the original loop is widened to a 3589 // vector form. Now we need to fix the recurrences in the loop. These PHI 3590 // nodes are currently empty because we did not want to introduce cycles. 3591 // This is the second stage of vectorizing recurrences. 3592 fixCrossIterationPHIs(); 3593 3594 // Forget the original basic block. 3595 PSE.getSE()->forgetLoop(OrigLoop); 3596 3597 // Fix-up external users of the induction variables. 3598 for (auto &Entry : Legal->getInductionVars()) 3599 fixupIVUsers(Entry.first, Entry.second, 3600 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3601 IVEndValues[Entry.first], LoopMiddleBlock); 3602 3603 fixLCSSAPHIs(); 3604 for (Instruction *PI : PredicatedInstructions) 3605 sinkScalarOperands(&*PI); 3606 3607 // Remove redundant induction instructions. 3608 cse(LoopVectorBody); 3609 3610 // Set/update profile weights for the vector and remainder loops as original 3611 // loop iterations are now distributed among them. Note that original loop 3612 // represented by LoopScalarBody becomes remainder loop after vectorization. 
3613 // 3614 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may 3615 // end up getting a slightly roughened result but that should be OK since 3616 // profile is not inherently precise anyway. Note also that possible bypass of 3617 // vector code caused by legality checks is ignored, assigning all the weight 3618 // to the vector loop, optimistically. 3619 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), 3620 LI->getLoopFor(LoopVectorBody), 3621 LI->getLoopFor(LoopScalarBody), VF * UF); 3622 } 3623 3624 void InnerLoopVectorizer::fixCrossIterationPHIs() { 3625 // In order to support recurrences we need to be able to vectorize Phi nodes. 3626 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3627 // stage #2: We now need to fix the recurrences by adding incoming edges to 3628 // the currently empty PHI nodes. At this point every instruction in the 3629 // original loop is widened to a vector form so we can use them to construct 3630 // the incoming edges. 3631 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 3632 // Handle first-order recurrences and reductions that need to be fixed. 3633 if (Legal->isFirstOrderRecurrence(&Phi)) 3634 fixFirstOrderRecurrence(&Phi); 3635 else if (Legal->isReductionVariable(&Phi)) 3636 fixReduction(&Phi); 3637 } 3638 } 3639 3640 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3641 // This is the second phase of vectorizing first-order recurrences. An 3642 // overview of the transformation is described below. Suppose we have the 3643 // following loop. 3644 // 3645 // for (int i = 0; i < n; ++i) 3646 // b[i] = a[i] - a[i - 1]; 3647 // 3648 // There is a first-order recurrence on "a". For this loop, the shorthand 3649 // scalar IR looks like: 3650 // 3651 // scalar.ph: 3652 // s_init = a[-1] 3653 // br scalar.body 3654 // 3655 // scalar.body: 3656 // i = phi [0, scalar.ph], [i+1, scalar.body] 3657 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3658 // s2 = a[i] 3659 // b[i] = s2 - s1 3660 // br cond, scalar.body, ... 3661 // 3662 // In this example, s1 is a recurrence because its value depends on the 3663 // previous iteration. In the first phase of vectorization, we created a 3664 // temporary value for s1. We now complete the vectorization and produce the 3665 // shorthand vector IR shown below (for VF = 4, UF = 1). 3666 // 3667 // vector.ph: 3668 // v_init = vector(..., ..., ..., a[-1]) 3669 // br vector.body 3670 // 3671 // vector.body 3672 // i = phi [0, vector.ph], [i+4, vector.body] 3673 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3674 // v2 = a[i, i+1, i+2, i+3]; 3675 // v3 = vector(v1(3), v2(0, 1, 2)) 3676 // b[i, i+1, i+2, i+3] = v2 - v3 3677 // br cond, vector.body, middle.block 3678 // 3679 // middle.block: 3680 // x = v2(3) 3681 // br scalar.ph 3682 // 3683 // scalar.ph: 3684 // s_init = phi [x, middle.block], [a[-1], otherwise] 3685 // br scalar.body 3686 // 3687 // After execution of the vector loop completes, we extract the next value of 3688 // the recurrence (x) to use as the initial value in the scalar loop. 3689 3690 // Get the original loop preheader and single loop latch. 3691 auto *Preheader = OrigLoop->getLoopPreheader(); 3692 auto *Latch = OrigLoop->getLoopLatch(); 3693 3694 // Get the initial and previous values of the scalar recurrence. 3695 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 3696 auto *Previous = Phi->getIncomingValueForBlock(Latch); 3697 3698 // Create a vector from the initial value.
3699 auto *VectorInit = ScalarInit; 3700 if (VF > 1) { 3701 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3702 VectorInit = Builder.CreateInsertElement( 3703 UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF)), 3704 VectorInit, Builder.getInt32(VF - 1), "vector.recur.init"); 3705 } 3706 3707 // We constructed a temporary phi node in the first phase of vectorization. 3708 // This phi node will eventually be deleted. 3709 Builder.SetInsertPoint( 3710 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3711 3712 // Create a phi node for the new recurrence. The current value will either be 3713 // the initial value inserted into a vector or loop-varying vector value. 3714 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3715 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3716 3717 // Get the vectorized previous value of the last part UF - 1. It appears last 3718 // among all unrolled iterations, due to the order of their construction. 3719 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3720 3721 // Find and set the insertion point after the previous value if it is an 3722 // instruction. 3723 BasicBlock::iterator InsertPt; 3724 // Note that the previous value may have been constant-folded so it is not 3725 // guaranteed to be an instruction in the vector loop. 3726 // FIXME: Loop invariant values do not form recurrences. We should deal with 3727 // them earlier. 3728 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3729 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3730 else { 3731 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3732 if (isa<PHINode>(PreviousLastPart)) 3733 // If the previous value is a phi node, we should insert after all the phi 3734 // nodes in the block containing the PHI to avoid breaking basic block 3735 // verification. Note that the basic block may be different to 3736 // LoopVectorBody, in case we predicate the loop. 3737 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3738 else 3739 InsertPt = ++PreviousInst->getIterator(); 3740 } 3741 Builder.SetInsertPoint(&*InsertPt); 3742 3743 // We will construct a vector for the recurrence by combining the values for 3744 // the current and previous iterations. This is the required shuffle mask. 3745 SmallVector<int, 8> ShuffleMask(VF); 3746 ShuffleMask[0] = VF - 1; 3747 for (unsigned I = 1; I < VF; ++I) 3748 ShuffleMask[I] = I + VF - 1; 3749 3750 // The vector from which to take the initial value for the current iteration 3751 // (actual or unrolled). Initially, this is the vector phi node. 3752 Value *Incoming = VecPhi; 3753 3754 // Shuffle the current and previous vector and update the vector parts. 3755 for (unsigned Part = 0; Part < UF; ++Part) { 3756 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3757 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3758 auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, 3759 ShuffleMask) 3760 : Incoming; 3761 PhiPart->replaceAllUsesWith(Shuffle); 3762 cast<Instruction>(PhiPart)->eraseFromParent(); 3763 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3764 Incoming = PreviousPart; 3765 } 3766 3767 // Fix the latch value of the new recurrence in the vector loop. 3768 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3769 3770 // Extract the last vector element in the middle block. 
This will be the 3771 // initial value for the recurrence when jumping to the scalar loop. 3772 auto *ExtractForScalar = Incoming; 3773 if (VF > 1) { 3774 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3775 ExtractForScalar = Builder.CreateExtractElement( 3776 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); 3777 } 3778 // Extract the second-to-last element in the middle block if the 3779 // Phi is used outside the loop. We need to extract the phi itself 3780 // and not the last element (the phi update in the current iteration). This 3781 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3782 // when the scalar loop is not run at all. 3783 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3784 if (VF > 1) 3785 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3786 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); 3787 // When the loop is unrolled without vectorizing, initialize 3788 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value of 3789 // `Incoming`. This is analogous to the vectorized case above: extracting the 3790 // second-to-last element when VF > 1. 3791 else if (UF > 1) 3792 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 3793 3794 // Fix the initial value of the original recurrence in the scalar loop. 3795 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3796 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3797 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3798 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3799 Start->addIncoming(Incoming, BB); 3800 } 3801 3802 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3803 Phi->setName("scalar.recur"); 3804 3805 // Finally, fix users of the recurrence outside the loop. The users will need 3806 // either the last value of the scalar recurrence or the last value of the 3807 // vector recurrence we extracted in the middle block. Since the loop is in 3808 // LCSSA form, we just need to find all the phi nodes for the original scalar 3809 // recurrence in the exit block, and then add an edge for the middle block. 3810 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3811 if (LCSSAPhi.getIncomingValue(0) == Phi) { 3812 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 3813 } 3814 } 3815 } 3816 3817 void InnerLoopVectorizer::fixReduction(PHINode *Phi) { 3818 Constant *Zero = Builder.getInt32(0); 3819 3820 // Get its reduction variable descriptor. 3821 assert(Legal->isReductionVariable(Phi) && 3822 "Unable to find the reduction variable"); 3823 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 3824 3825 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3826 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3827 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3828 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = 3829 RdxDesc.getMinMaxRecurrenceKind(); 3830 setDebugLocFromInst(Builder, ReductionStartValue); 3831 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); 3832 3833 // We need to generate a reduction vector from the incoming scalar. 3834 // To do so, we need to generate the 'identity' vector and override 3835 // one of the elements with the incoming scalar reduction. We need 3836 // to do it in the vector-loop preheader.
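// For example (types are illustrative): for an i32 add reduction with start value %s and
// VF = 4, the identity vector is <0, 0, 0, 0> and the vector start value is <%s, 0, 0, 0>;
// only part 0 of the unrolled phi receives the start value, while the remaining parts start
// from the plain identity.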
3837 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3838 3839 // This is the vector-clone of the value that leaves the loop. 3840 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); 3841 3842 // Find the reduction identity value: zero for addition, or and xor; 3843 // one for multiplication; -1 for and. 3844 Value *Identity; 3845 Value *VectorStart; 3846 if (RK == RecurrenceDescriptor::RK_IntegerMinMax || 3847 RK == RecurrenceDescriptor::RK_FloatMinMax) { 3848 // MinMax reductions have the start value as their identity. 3849 if (VF == 1 || IsInLoopReductionPhi) { 3850 VectorStart = Identity = ReductionStartValue; 3851 } else { 3852 VectorStart = Identity = 3853 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); 3854 } 3855 } else { 3856 // Handle other reduction kinds: 3857 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 3858 RK, VecTy->getScalarType()); 3859 if (VF == 1 || IsInLoopReductionPhi) { 3860 Identity = Iden; 3861 // This vector is the Identity vector where the first element is the 3862 // incoming scalar reduction. 3863 VectorStart = ReductionStartValue; 3864 } else { 3865 Identity = ConstantVector::getSplat(ElementCount::getFixed(VF), Iden); 3866 3867 // This vector is the Identity vector where the first element is the 3868 // incoming scalar reduction. 3869 VectorStart = 3870 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); 3871 } 3872 } 3873 3874 // Wrap flags are in general invalid after vectorization, clear them. 3875 clearReductionWrapFlags(RdxDesc); 3876 3877 // Fix the vector-loop phi. 3878 3879 // Reductions do not have to start at zero. They can start with 3880 // any loop-invariant value. 3881 BasicBlock *Latch = OrigLoop->getLoopLatch(); 3882 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 3883 3884 for (unsigned Part = 0; Part < UF; ++Part) { 3885 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 3886 Value *Val = getOrCreateVectorValue(LoopVal, Part); 3887 // Make sure to add the reduction start value only to the 3888 // first unroll part. 3889 Value *StartVal = (Part == 0) ? VectorStart : Identity; 3890 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 3891 cast<PHINode>(VecRdxPhi) 3892 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3893 } 3894 3895 // Before each round, move the insertion point right between 3896 // the PHIs and the values we are going to write. 3897 // This allows us to write both PHINodes and the extractelement 3898 // instructions. 3899 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3900 3901 setDebugLocFromInst(Builder, LoopExitInst); 3902 3903 // If the tail is folded by masking, the vector value to leave the loop should be 3904 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3905 // instead of the former.
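// For illustration (the value names are hypothetical), with tail folding the loop body
// contains something like
//   %rdx.sel = select <4 x i1> %mask, <4 x i32> %rdx.next, <4 x i32> %rdx.phi
// and it is %rdx.sel, not %rdx.next, that must be fed to the reduction performed after the
// loop, which is what the remapping below arranges.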
3906 if (Cost->foldTailByMasking()) { 3907 for (unsigned Part = 0; Part < UF; ++Part) { 3908 Value *VecLoopExitInst = 3909 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3910 Value *Sel = nullptr; 3911 for (User *U : VecLoopExitInst->users()) { 3912 if (isa<SelectInst>(U)) { 3913 assert(!Sel && "Reduction exit feeding two selects"); 3914 Sel = U; 3915 } else 3916 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3917 } 3918 assert(Sel && "Reduction exit feeds no select"); 3919 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 3920 } 3921 } 3922 3923 // If the vector reduction can be performed in a smaller type, we truncate 3924 // then extend the loop exit value to enable InstCombine to evaluate the 3925 // entire expression in the smaller type. 3926 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { 3927 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 3928 Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF); 3929 Builder.SetInsertPoint( 3930 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 3931 VectorParts RdxParts(UF); 3932 for (unsigned Part = 0; Part < UF; ++Part) { 3933 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3934 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3935 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3936 : Builder.CreateZExt(Trunc, VecTy); 3937 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 3938 UI != RdxParts[Part]->user_end();) 3939 if (*UI != Trunc) { 3940 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 3941 RdxParts[Part] = Extnd; 3942 } else { 3943 ++UI; 3944 } 3945 } 3946 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3947 for (unsigned Part = 0; Part < UF; ++Part) { 3948 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3949 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 3950 } 3951 } 3952 3953 // Reduce all of the unrolled parts into a single vector. 3954 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 3955 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3956 3957 // The middle block terminator has already been assigned a DebugLoc here (the 3958 // OrigLoop's single latch terminator). We want the whole middle block to 3959 // appear to execute on this line because: (a) it is all compiler generated, 3960 // (b) these instructions are always executed after evaluating the latch 3961 // conditional branch, and (c) other passes may add new predecessors which 3962 // terminate on this line. This is the easiest way to ensure we don't 3963 // accidentally cause an extra step back into the loop while debugging. 3964 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 3965 for (unsigned Part = 1; Part < UF; ++Part) { 3966 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3967 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3968 // Floating point operations had to be 'fast' to enable the reduction. 3969 ReducedPartRdx = addFastMathFlag( 3970 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 3971 ReducedPartRdx, "bin.rdx"), 3972 RdxDesc.getFastMathFlags()); 3973 else 3974 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 3975 RdxPart); 3976 } 3977 3978 // Create the reduction after the loop. Note that inloop reductions create the 3979 // target reduction in the loop using a Reduction recipe. 
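// For illustration (shapes are hypothetical), an integer add reduction with UF = 2 and VF = 4
// first combines its two unrolled <4 x i32> parts with a "bin.rdx" add above, and
// createTargetReduction below then reduces the four lanes of that single vector to the final
// scalar value.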
3980 if (VF > 1 && !IsInLoopReductionPhi) { 3981 bool NoNaN = Legal->hasFunNoNaNAttr(); 3982 ReducedPartRdx = 3983 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 3984 // If the reduction can be performed in a smaller type, we need to extend 3985 // the reduction to the wider type before we branch to the original loop. 3986 if (Phi->getType() != RdxDesc.getRecurrenceType()) 3987 ReducedPartRdx = 3988 RdxDesc.isSigned() 3989 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 3990 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 3991 } 3992 3993 // Create a phi node that merges control-flow from the backedge-taken check 3994 // block and the middle block. 3995 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 3996 LoopScalarPreHeader->getTerminator()); 3997 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3998 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3999 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4000 4001 // Now, we need to fix the users of the reduction variable 4002 // inside and outside of the scalar remainder loop. 4003 // We know that the loop is in LCSSA form. We need to update the 4004 // PHI nodes in the exit blocks. 4005 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4006 // All PHINodes need to have a single entry edge, or two if 4007 // we already fixed them. 4008 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 4009 4010 // We found a reduction value exit-PHI. Update it with the 4011 // incoming bypass edge. 4012 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 4013 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4014 } // end of the LCSSA phi scan. 4015 4016 // Fix the scalar loop reduction variable with the incoming reduction sum 4017 // from the vector body and from the backedge value. 4018 int IncomingEdgeBlockIdx = 4019 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4020 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4021 // Pick the other block. 4022 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 4023 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4024 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4025 } 4026 4027 void InnerLoopVectorizer::clearReductionWrapFlags( 4028 RecurrenceDescriptor &RdxDesc) { 4029 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 4030 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 4031 RK != RecurrenceDescriptor::RK_IntegerMult) 4032 return; 4033 4034 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4035 assert(LoopExitInstr && "null loop exit instruction"); 4036 SmallVector<Instruction *, 8> Worklist; 4037 SmallPtrSet<Instruction *, 8> Visited; 4038 Worklist.push_back(LoopExitInstr); 4039 Visited.insert(LoopExitInstr); 4040 4041 while (!Worklist.empty()) { 4042 Instruction *Cur = Worklist.pop_back_val(); 4043 if (isa<OverflowingBinaryOperator>(Cur)) 4044 for (unsigned Part = 0; Part < UF; ++Part) { 4045 Value *V = getOrCreateVectorValue(Cur, Part); 4046 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4047 } 4048 4049 for (User *U : Cur->users()) { 4050 Instruction *UI = cast<Instruction>(U); 4051 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4052 Visited.insert(UI).second) 4053 Worklist.push_back(UI); 4054 } 4055 } 4056 } 4057 4058 void InnerLoopVectorizer::fixLCSSAPHIs() { 4059 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4060 if (LCSSAPhi.getNumIncomingValues() == 1) { 4061 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4062 // Non-instruction incoming values will have only one value. 4063 unsigned LastLane = 0; 4064 if (isa<Instruction>(IncomingValue)) 4065 LastLane = Cost->isUniformAfterVectorization( 4066 cast<Instruction>(IncomingValue), VF) 4067 ? 0 4068 : VF - 1; 4069 // Can be a loop invariant incoming value or the last scalar value to be 4070 // extracted from the vectorized loop. 4071 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4072 Value *lastIncomingValue = 4073 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 4074 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4075 } 4076 } 4077 } 4078 4079 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4080 // The basic block and loop containing the predicated instruction. 4081 auto *PredBB = PredInst->getParent(); 4082 auto *VectorLoop = LI->getLoopFor(PredBB); 4083 4084 // Initialize a worklist with the operands of the predicated instruction. 4085 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4086 4087 // Holds instructions that we need to analyze again. An instruction may be 4088 // reanalyzed if we don't yet know if we can sink it or not. 4089 SmallVector<Instruction *, 8> InstsToReanalyze; 4090 4091 // Returns true if a given use occurs in the predicated block. Phi nodes use 4092 // their operands in their corresponding predecessor blocks. 4093 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4094 auto *I = cast<Instruction>(U.getUser()); 4095 BasicBlock *BB = I->getParent(); 4096 if (auto *Phi = dyn_cast<PHINode>(I)) 4097 BB = Phi->getIncomingBlock( 4098 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4099 return BB == PredBB; 4100 }; 4101 4102 // Iteratively sink the scalarized operands of the predicated instruction 4103 // into the block we created for it. When an instruction is sunk, it's 4104 // operands are then added to the worklist. The algorithm ends after one pass 4105 // through the worklist doesn't sink a single instruction. 
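  // Sketch of the intent (hypothetical scalarized code, not taken from any
  // particular input): given a predicated block
  //   pred.store.if:
  //     store i32 %v, i32* %gep
  // an address computation like %gep that is used only by the predicated
  // store can be moved into pred.store.if; sinking it may in turn allow its
  // own operands to be sunk on a later pass over the worklist.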
4106 bool Changed; 4107 do { 4108 // Add the instructions that need to be reanalyzed to the worklist, and 4109 // reset the changed indicator. 4110 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4111 InstsToReanalyze.clear(); 4112 Changed = false; 4113 4114 while (!Worklist.empty()) { 4115 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4116 4117 // We can't sink an instruction if it is a phi node, is already in the 4118 // predicated block, is not in the loop, or may have side effects. 4119 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4120 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4121 continue; 4122 4123 // It's legal to sink the instruction if all its uses occur in the 4124 // predicated block. Otherwise, there's nothing to do yet, and we may 4125 // need to reanalyze the instruction. 4126 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4127 InstsToReanalyze.push_back(I); 4128 continue; 4129 } 4130 4131 // Move the instruction to the beginning of the predicated block, and add 4132 // it's operands to the worklist. 4133 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4134 Worklist.insert(I->op_begin(), I->op_end()); 4135 4136 // The sinking may have enabled other instructions to be sunk, so we will 4137 // need to iterate. 4138 Changed = true; 4139 } 4140 } while (Changed); 4141 } 4142 4143 void InnerLoopVectorizer::fixNonInductionPHIs() { 4144 for (PHINode *OrigPhi : OrigPHIsToFix) { 4145 PHINode *NewPhi = 4146 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4147 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4148 4149 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4150 predecessors(OrigPhi->getParent())); 4151 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4152 predecessors(NewPhi->getParent())); 4153 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4154 "Scalar and Vector BB should have the same number of predecessors"); 4155 4156 // The insertion point in Builder may be invalidated by the time we get 4157 // here. Force the Builder insertion point to something valid so that we do 4158 // not run into issues during insertion point restore in 4159 // getOrCreateVectorValue calls below. 4160 Builder.SetInsertPoint(NewPhi); 4161 4162 // The predecessor order is preserved and we can rely on mapping between 4163 // scalar and vector block predecessors. 4164 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4165 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4166 4167 // When looking up the new scalar/vector values to fix up, use incoming 4168 // values from original phi. 4169 Value *ScIncV = 4170 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4171 4172 // Scalar incoming value may need a broadcast 4173 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4174 NewPhi->addIncoming(NewIncV, NewPredBB); 4175 } 4176 } 4177 } 4178 4179 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, 4180 unsigned UF, unsigned VF, 4181 bool IsPtrLoopInvariant, 4182 SmallBitVector &IsIndexLoopInvariant, 4183 VPTransformState &State) { 4184 // Construct a vector GEP by widening the operands of the scalar GEP as 4185 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4186 // results in a vector of pointers when at least one operand of the GEP 4187 // is vector-typed. Thus, to keep the representation compact, we only use 4188 // vector-typed operands for loop-varying values. 
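  // For example (assumed VF=4; the IR below is illustrative only): for a
  // scalar access a[i], the widened address is
  //   %vec.gep = getelementptr inbounds i32, i32* %a, <4 x i64> %vec.ind
  // i.e. the loop-invariant base stays scalar and only the loop-varying index
  // is vector-typed, which still yields a <4 x i32*> result.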
4189 4190 if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4191 // If we are vectorizing, but the GEP has only loop-invariant operands, 4192 // the GEP we build (by only using vector-typed operands for 4193 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4194 // produce a vector of pointers, we need to either arbitrarily pick an 4195 // operand to broadcast, or broadcast a clone of the original GEP. 4196 // Here, we broadcast a clone of the original. 4197 // 4198 // TODO: If at some point we decide to scalarize instructions having 4199 // loop-invariant operands, this special case will no longer be 4200 // required. We would add the scalarization decision to 4201 // collectLoopScalars() and teach getVectorValue() to broadcast 4202 // the lane-zero scalar value. 4203 auto *Clone = Builder.Insert(GEP->clone()); 4204 for (unsigned Part = 0; Part < UF; ++Part) { 4205 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4206 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4207 addMetadata(EntryPart, GEP); 4208 } 4209 } else { 4210 // If the GEP has at least one loop-varying operand, we are sure to 4211 // produce a vector of pointers. But if we are only unrolling, we want 4212 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4213 // produce with the code below will be scalar (if VF == 1) or vector 4214 // (otherwise). Note that for the unroll-only case, we still maintain 4215 // values in the vector mapping with initVector, as we do for other 4216 // instructions. 4217 for (unsigned Part = 0; Part < UF; ++Part) { 4218 // The pointer operand of the new GEP. If it's loop-invariant, we 4219 // won't broadcast it. 4220 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) 4221 : State.get(Operands.getOperand(0), Part); 4222 4223 // Collect all the indices for the new GEP. If any index is 4224 // loop-invariant, we won't broadcast it. 4225 SmallVector<Value *, 4> Indices; 4226 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4227 VPValue *Operand = Operands.getOperand(I); 4228 if (IsIndexLoopInvariant[I - 1]) 4229 Indices.push_back(State.get(Operand, {0, 0})); 4230 else 4231 Indices.push_back(State.get(Operand, Part)); 4232 } 4233 4234 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4235 // but it should be a vector, otherwise. 4236 auto *NewGEP = 4237 GEP->isInBounds() 4238 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4239 Indices) 4240 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4241 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4242 "NewGEP is not a pointer vector"); 4243 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4244 addMetadata(NewGEP, GEP); 4245 } 4246 } 4247 } 4248 4249 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4250 unsigned VF) { 4251 PHINode *P = cast<PHINode>(PN); 4252 if (EnableVPlanNativePath) { 4253 // Currently we enter here in the VPlan-native path for non-induction 4254 // PHIs where all control flow is uniform. We simply widen these PHIs. 4255 // Create a vector phi with no operands - the vector phi operands will be 4256 // set at the end of vector code generation. 4257 Type *VecTy = 4258 (VF == 1) ? 
PN->getType() : FixedVectorType::get(PN->getType(), VF); 4259 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4260 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4261 OrigPHIsToFix.push_back(P); 4262 4263 return; 4264 } 4265 4266 assert(PN->getParent() == OrigLoop->getHeader() && 4267 "Non-header phis should have been handled elsewhere"); 4268 4269 // In order to support recurrences we need to be able to vectorize Phi nodes. 4270 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4271 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4272 // this value when we vectorize all of the instructions that use the PHI. 4273 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4274 for (unsigned Part = 0; Part < UF; ++Part) { 4275 // This is phase one of vectorizing PHIs. 4276 bool ScalarPHI = (VF == 1) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4277 Type *VecTy = 4278 ScalarPHI ? PN->getType() : FixedVectorType::get(PN->getType(), VF); 4279 Value *EntryPart = PHINode::Create( 4280 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4281 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4282 } 4283 return; 4284 } 4285 4286 setDebugLocFromInst(Builder, P); 4287 4288 // This PHINode must be an induction variable. 4289 // Make sure that we know about it. 4290 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4291 4292 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4293 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4294 4295 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4296 // which can be found from the original scalar operations. 4297 switch (II.getKind()) { 4298 case InductionDescriptor::IK_NoInduction: 4299 llvm_unreachable("Unknown induction"); 4300 case InductionDescriptor::IK_IntInduction: 4301 case InductionDescriptor::IK_FpInduction: 4302 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4303 case InductionDescriptor::IK_PtrInduction: { 4304 // Handle the pointer induction variable case. 4305 assert(P->getType()->isPointerTy() && "Unexpected type."); 4306 4307 if (Cost->isScalarAfterVectorization(P, VF)) { 4308 // This is the normalized GEP that starts counting at zero. 4309 Value *PtrInd = 4310 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4311 // Determine the number of scalars we need to generate for each unroll 4312 // iteration. If the instruction is uniform, we only need to generate the 4313 // first lane. Otherwise, we generate all VF values. 4314 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 
1 : VF; 4315 for (unsigned Part = 0; Part < UF; ++Part) { 4316 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4317 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); 4318 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4319 Value *SclrGep = 4320 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4321 SclrGep->setName("next.gep"); 4322 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4323 } 4324 } 4325 return; 4326 } 4327 assert(isa<SCEVConstant>(II.getStep()) && 4328 "Induction step not a SCEV constant!"); 4329 Type *PhiType = II.getStep()->getType(); 4330 4331 // Build a pointer phi 4332 Value *ScalarStartValue = II.getStartValue(); 4333 Type *ScStValueType = ScalarStartValue->getType(); 4334 PHINode *NewPointerPhi = 4335 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4336 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4337 4338 // A pointer induction, performed by using a gep 4339 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4340 Instruction *InductionLoc = LoopLatch->getTerminator(); 4341 const SCEV *ScalarStep = II.getStep(); 4342 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4343 Value *ScalarStepValue = 4344 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4345 Value *InductionGEP = GetElementPtrInst::Create( 4346 ScStValueType->getPointerElementType(), NewPointerPhi, 4347 Builder.CreateMul(ScalarStepValue, ConstantInt::get(PhiType, VF * UF)), 4348 "ptr.ind", InductionLoc); 4349 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4350 4351 // Create UF many actual address geps that use the pointer 4352 // phi as base and a vectorized version of the step value 4353 // (<step*0, ..., step*N>) as offset. 4354 for (unsigned Part = 0; Part < UF; ++Part) { 4355 SmallVector<Constant *, 8> Indices; 4356 // Create a vector of consecutive numbers from zero to VF. 4357 for (unsigned i = 0; i < VF; ++i) 4358 Indices.push_back(ConstantInt::get(PhiType, i + Part * VF)); 4359 Constant *StartOffset = ConstantVector::get(Indices); 4360 4361 Value *GEP = Builder.CreateGEP( 4362 ScStValueType->getPointerElementType(), NewPointerPhi, 4363 Builder.CreateMul(StartOffset, 4364 Builder.CreateVectorSplat(VF, ScalarStepValue), 4365 "vector.gep")); 4366 VectorLoopValueMap.setVectorValue(P, Part, GEP); 4367 } 4368 } 4369 } 4370 } 4371 4372 /// A helper function for checking whether an integer division-related 4373 /// instruction may divide by zero (in which case it must be predicated if 4374 /// executed conditionally in the scalar code). 4375 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4376 /// Non-zero divisors that are non compile-time constants will not be 4377 /// converted into multiplication, so we will still end up scalarizing 4378 /// the division, but can do so w/o predication. 
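/// For illustration (hypothetical source loop, not from this file): in
///   if (b[i] != 0) q[i] = a[i] / b[i];
/// the divisor is loop-variant, so the scalarized division must remain
/// predicated, whereas a constant divisor such as in a[i] / 7 is known
/// non-zero and the scalarized division can be emitted unconditionally.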
4379 static bool mayDivideByZero(Instruction &I) { 4380 assert((I.getOpcode() == Instruction::UDiv || 4381 I.getOpcode() == Instruction::SDiv || 4382 I.getOpcode() == Instruction::URem || 4383 I.getOpcode() == Instruction::SRem) && 4384 "Unexpected instruction"); 4385 Value *Divisor = I.getOperand(1); 4386 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4387 return !CInt || CInt->isZero(); 4388 } 4389 4390 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, 4391 VPTransformState &State) { 4392 switch (I.getOpcode()) { 4393 case Instruction::Call: 4394 case Instruction::Br: 4395 case Instruction::PHI: 4396 case Instruction::GetElementPtr: 4397 case Instruction::Select: 4398 llvm_unreachable("This instruction is handled by a different recipe."); 4399 case Instruction::UDiv: 4400 case Instruction::SDiv: 4401 case Instruction::SRem: 4402 case Instruction::URem: 4403 case Instruction::Add: 4404 case Instruction::FAdd: 4405 case Instruction::Sub: 4406 case Instruction::FSub: 4407 case Instruction::FNeg: 4408 case Instruction::Mul: 4409 case Instruction::FMul: 4410 case Instruction::FDiv: 4411 case Instruction::FRem: 4412 case Instruction::Shl: 4413 case Instruction::LShr: 4414 case Instruction::AShr: 4415 case Instruction::And: 4416 case Instruction::Or: 4417 case Instruction::Xor: { 4418 // Just widen unops and binops. 4419 setDebugLocFromInst(Builder, &I); 4420 4421 for (unsigned Part = 0; Part < UF; ++Part) { 4422 SmallVector<Value *, 2> Ops; 4423 for (VPValue *VPOp : User.operands()) 4424 Ops.push_back(State.get(VPOp, Part)); 4425 4426 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4427 4428 if (auto *VecOp = dyn_cast<Instruction>(V)) 4429 VecOp->copyIRFlags(&I); 4430 4431 // Use this vector value for all users of the original instruction. 4432 VectorLoopValueMap.setVectorValue(&I, Part, V); 4433 addMetadata(V, &I); 4434 } 4435 4436 break; 4437 } 4438 case Instruction::ICmp: 4439 case Instruction::FCmp: { 4440 // Widen compares. Generate vector compares. 4441 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4442 auto *Cmp = cast<CmpInst>(&I); 4443 setDebugLocFromInst(Builder, Cmp); 4444 for (unsigned Part = 0; Part < UF; ++Part) { 4445 Value *A = State.get(User.getOperand(0), Part); 4446 Value *B = State.get(User.getOperand(1), Part); 4447 Value *C = nullptr; 4448 if (FCmp) { 4449 // Propagate fast math flags. 4450 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4451 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4452 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4453 } else { 4454 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4455 } 4456 VectorLoopValueMap.setVectorValue(&I, Part, C); 4457 addMetadata(C, &I); 4458 } 4459 4460 break; 4461 } 4462 4463 case Instruction::ZExt: 4464 case Instruction::SExt: 4465 case Instruction::FPToUI: 4466 case Instruction::FPToSI: 4467 case Instruction::FPExt: 4468 case Instruction::PtrToInt: 4469 case Instruction::IntToPtr: 4470 case Instruction::SIToFP: 4471 case Instruction::UIToFP: 4472 case Instruction::Trunc: 4473 case Instruction::FPTrunc: 4474 case Instruction::BitCast: { 4475 auto *CI = cast<CastInst>(&I); 4476 setDebugLocFromInst(Builder, CI); 4477 4478 /// Vectorize casts. 4479 Type *DestTy = 4480 (VF == 1) ? 
CI->getType() : FixedVectorType::get(CI->getType(), VF); 4481 4482 for (unsigned Part = 0; Part < UF; ++Part) { 4483 Value *A = State.get(User.getOperand(0), Part); 4484 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4485 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4486 addMetadata(Cast, &I); 4487 } 4488 break; 4489 } 4490 default: 4491 // This instruction is not vectorized by simple widening. 4492 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4493 llvm_unreachable("Unhandled instruction!"); 4494 } // end of switch. 4495 } 4496 4497 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, 4498 VPTransformState &State) { 4499 assert(!isa<DbgInfoIntrinsic>(I) && 4500 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4501 setDebugLocFromInst(Builder, &I); 4502 4503 Module *M = I.getParent()->getParent()->getParent(); 4504 auto *CI = cast<CallInst>(&I); 4505 4506 SmallVector<Type *, 4> Tys; 4507 for (Value *ArgOperand : CI->arg_operands()) 4508 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4509 4510 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4511 4512 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4513 // version of the instruction. 4514 // Is it beneficial to perform intrinsic call compared to lib call? 4515 bool NeedToScalarize = false; 4516 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4517 bool UseVectorIntrinsic = 4518 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4519 assert((UseVectorIntrinsic || !NeedToScalarize) && 4520 "Instruction should be scalarized elsewhere."); 4521 4522 for (unsigned Part = 0; Part < UF; ++Part) { 4523 SmallVector<Value *, 4> Args; 4524 for (auto &I : enumerate(ArgOperands.operands())) { 4525 // Some intrinsics have a scalar argument - don't replace it with a 4526 // vector. 4527 Value *Arg; 4528 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4529 Arg = State.get(I.value(), Part); 4530 else 4531 Arg = State.get(I.value(), {0, 0}); 4532 Args.push_back(Arg); 4533 } 4534 4535 Function *VectorF; 4536 if (UseVectorIntrinsic) { 4537 // Use vector version of the intrinsic. 4538 Type *TysForDecl[] = {CI->getType()}; 4539 if (VF > 1) 4540 TysForDecl[0] = 4541 FixedVectorType::get(CI->getType()->getScalarType(), VF); 4542 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4543 assert(VectorF && "Can't retrieve vector intrinsic."); 4544 } else { 4545 // Use vector version of the function call. 4546 const VFShape Shape = VFShape::get(*CI, ElementCount::getFixed(VF), 4547 false /*HasGlobalPred*/); 4548 #ifndef NDEBUG 4549 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4550 "Can't create vector function."); 4551 #endif 4552 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4553 } 4554 SmallVector<OperandBundleDef, 1> OpBundles; 4555 CI->getOperandBundlesAsDefs(OpBundles); 4556 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4557 4558 if (isa<FPMathOperator>(V)) 4559 V->copyFastMathFlags(CI); 4560 4561 VectorLoopValueMap.setVectorValue(&I, Part, V); 4562 addMetadata(V, &I); 4563 } 4564 } 4565 4566 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, 4567 VPUser &Operands, 4568 bool InvariantCond, 4569 VPTransformState &State) { 4570 setDebugLocFromInst(Builder, &I); 4571 4572 // The condition can be loop invariant but still defined inside the 4573 // loop. This means that we can't just use the original 'cond' value. 
4574 // We have to take the 'vectorized' value and pick the first lane. 4575 // Instcombine will make this a no-op. 4576 auto *InvarCond = 4577 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4578 4579 for (unsigned Part = 0; Part < UF; ++Part) { 4580 Value *Cond = 4581 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4582 Value *Op0 = State.get(Operands.getOperand(1), Part); 4583 Value *Op1 = State.get(Operands.getOperand(2), Part); 4584 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4585 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4586 addMetadata(Sel, &I); 4587 } 4588 } 4589 4590 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4591 // We should not collect Scalars more than once per VF. Right now, this 4592 // function is called from collectUniformsAndScalars(), which already does 4593 // this check. Collecting Scalars for VF=1 does not make any sense. 4594 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && 4595 "This function should not be visited twice for the same VF"); 4596 4597 SmallSetVector<Instruction *, 8> Worklist; 4598 4599 // These sets are used to seed the analysis with pointers used by memory 4600 // accesses that will remain scalar. 4601 SmallSetVector<Instruction *, 8> ScalarPtrs; 4602 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4603 auto *Latch = TheLoop->getLoopLatch(); 4604 4605 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4606 // The pointer operands of loads and stores will be scalar as long as the 4607 // memory access is not a gather or scatter operation. The value operand of a 4608 // store will remain scalar if the store is scalarized. 4609 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4610 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4611 assert(WideningDecision != CM_Unknown && 4612 "Widening decision should be ready at this moment"); 4613 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4614 if (Ptr == Store->getValueOperand()) 4615 return WideningDecision == CM_Scalarize; 4616 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4617 "Ptr is neither a value or pointer operand"); 4618 return WideningDecision != CM_GatherScatter; 4619 }; 4620 4621 // A helper that returns true if the given value is a bitcast or 4622 // getelementptr instruction contained in the loop. 4623 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4624 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4625 isa<GetElementPtrInst>(V)) && 4626 !TheLoop->isLoopInvariant(V); 4627 }; 4628 4629 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 4630 if (!isa<PHINode>(Ptr) || 4631 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 4632 return false; 4633 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 4634 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 4635 return false; 4636 return isScalarUse(MemAccess, Ptr); 4637 }; 4638 4639 // A helper that evaluates a memory access's use of a pointer. If the 4640 // pointer is actually the pointer induction of a loop, it is being 4641 // inserted into Worklist. If the use will be a scalar use, and the 4642 // pointer is only used by memory accesses, we place the pointer in 4643 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 
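  // Example of the distinction (hypothetical code): for a scalarized
  // conditional store `if (c) a[i] = x;`, the getelementptr computing &a[i]
  // has a scalar use and, if its only users are loads and stores, it is
  // placed in ScalarPtrs; if the same address is also used by a non-memory
  // instruction (say, a pointer compare), it goes into PossibleNonScalarPtrs
  // instead.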
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    if (isScalarPtrInduction(MemAccess, Ptr)) {
      Worklist.insert(cast<Instruction>(Ptr));
      Instruction *Update = cast<Instruction>(
          cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
      Worklist.insert(Update);
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
                        << "\n");
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
                        << "\n");
      return;
    }
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // Insert the forced scalars.
  // FIXME: Currently widenPHIInstruction() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second)
      Worklist.insert(I);

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
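  // For instance (illustrative): if an instruction already known to be scalar
  // has a loop-varying bitcast or getelementptr as its first operand, and
  // every user of that operand is outside the loop, already in the worklist,
  // or a load/store using it as a scalar address, the operand is added to the
  // worklist as well; the loop below applies exactly this closure.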
4719 unsigned Idx = 0; 4720 while (Idx != Worklist.size()) { 4721 Instruction *Dst = Worklist[Idx++]; 4722 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4723 continue; 4724 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4725 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4726 auto *J = cast<Instruction>(U); 4727 return !TheLoop->contains(J) || Worklist.count(J) || 4728 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4729 isScalarUse(J, Src)); 4730 })) { 4731 Worklist.insert(Src); 4732 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4733 } 4734 } 4735 4736 // An induction variable will remain scalar if all users of the induction 4737 // variable and induction variable update remain scalar. 4738 for (auto &Induction : Legal->getInductionVars()) { 4739 auto *Ind = Induction.first; 4740 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4741 4742 // If tail-folding is applied, the primary induction variable will be used 4743 // to feed a vector compare. 4744 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4745 continue; 4746 4747 // Determine if all users of the induction variable are scalar after 4748 // vectorization. 4749 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4750 auto *I = cast<Instruction>(U); 4751 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4752 }); 4753 if (!ScalarInd) 4754 continue; 4755 4756 // Determine if all users of the induction variable update instruction are 4757 // scalar after vectorization. 4758 auto ScalarIndUpdate = 4759 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4760 auto *I = cast<Instruction>(U); 4761 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4762 }); 4763 if (!ScalarIndUpdate) 4764 continue; 4765 4766 // The induction variable and its update instruction will remain scalar. 4767 Worklist.insert(Ind); 4768 Worklist.insert(IndUpdate); 4769 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4770 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4771 << "\n"); 4772 } 4773 4774 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4775 } 4776 4777 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { 4778 if (!blockNeedsPredication(I->getParent())) 4779 return false; 4780 switch(I->getOpcode()) { 4781 default: 4782 break; 4783 case Instruction::Load: 4784 case Instruction::Store: { 4785 if (!Legal->isMaskRequired(I)) 4786 return false; 4787 auto *Ptr = getLoadStorePointerOperand(I); 4788 auto *Ty = getMemInstValueType(I); 4789 // We have already decided how to vectorize this instruction, get that 4790 // result. 4791 if (VF > 1) { 4792 InstWidening WideningDecision = getWideningDecision(I, VF); 4793 assert(WideningDecision != CM_Unknown && 4794 "Widening decision should be ready at this moment"); 4795 return WideningDecision == CM_Scalarize; 4796 } 4797 const Align Alignment = getLoadStoreAlignment(I); 4798 return isa<LoadInst>(I) ? 
!(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4799 isLegalMaskedGather(Ty, Alignment)) 4800 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4801 isLegalMaskedScatter(Ty, Alignment)); 4802 } 4803 case Instruction::UDiv: 4804 case Instruction::SDiv: 4805 case Instruction::SRem: 4806 case Instruction::URem: 4807 return mayDivideByZero(*I); 4808 } 4809 return false; 4810 } 4811 4812 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, 4813 unsigned VF) { 4814 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4815 assert(getWideningDecision(I, VF) == CM_Unknown && 4816 "Decision should not be set yet."); 4817 auto *Group = getInterleavedAccessGroup(I); 4818 assert(Group && "Must have a group."); 4819 4820 // If the instruction's allocated size doesn't equal it's type size, it 4821 // requires padding and will be scalarized. 4822 auto &DL = I->getModule()->getDataLayout(); 4823 auto *ScalarTy = getMemInstValueType(I); 4824 if (hasIrregularType(ScalarTy, DL, VF)) 4825 return false; 4826 4827 // Check if masking is required. 4828 // A Group may need masking for one of two reasons: it resides in a block that 4829 // needs predication, or it was decided to use masking to deal with gaps. 4830 bool PredicatedAccessRequiresMasking = 4831 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 4832 bool AccessWithGapsRequiresMasking = 4833 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 4834 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 4835 return true; 4836 4837 // If masked interleaving is required, we expect that the user/target had 4838 // enabled it, because otherwise it either wouldn't have been created or 4839 // it should have been invalidated by the CostModel. 4840 assert(useMaskedInterleavedAccesses(TTI) && 4841 "Masked interleave-groups for predicated accesses are not enabled."); 4842 4843 auto *Ty = getMemInstValueType(I); 4844 const Align Alignment = getLoadStoreAlignment(I); 4845 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4846 : TTI.isLegalMaskedStore(Ty, Alignment); 4847 } 4848 4849 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, 4850 unsigned VF) { 4851 // Get and ensure we have a valid memory instruction. 4852 LoadInst *LI = dyn_cast<LoadInst>(I); 4853 StoreInst *SI = dyn_cast<StoreInst>(I); 4854 assert((LI || SI) && "Invalid memory instruction"); 4855 4856 auto *Ptr = getLoadStorePointerOperand(I); 4857 4858 // In order to be widened, the pointer should be consecutive, first of all. 4859 if (!Legal->isConsecutivePtr(Ptr)) 4860 return false; 4861 4862 // If the instruction is a store located in a predicated block, it will be 4863 // scalarized. 4864 if (isScalarWithPredication(I)) 4865 return false; 4866 4867 // If the instruction's allocated size doesn't equal it's type size, it 4868 // requires padding and will be scalarized. 4869 auto &DL = I->getModule()->getDataLayout(); 4870 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 4871 if (hasIrregularType(ScalarTy, DL, VF)) 4872 return false; 4873 4874 return true; 4875 } 4876 4877 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { 4878 // We should not collect Uniforms more than once per VF. Right now, 4879 // this function is called from collectUniformsAndScalars(), which 4880 // already does this check. Collecting Uniforms for VF=1 does not make any 4881 // sense. 
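  // Typical uniform values (illustrative): the compare feeding the latch
  // branch, or the address computation of a consecutive load/store, which is
  // needed only once per vector iteration (for lane zero) to form the wide
  // access rather than once per lane.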
4882 4883 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && 4884 "This function should not be visited twice for the same VF"); 4885 4886 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4887 // not analyze again. Uniforms.count(VF) will return 1. 4888 Uniforms[VF].clear(); 4889 4890 // We now know that the loop is vectorizable! 4891 // Collect instructions inside the loop that will remain uniform after 4892 // vectorization. 4893 4894 // Global values, params and instructions outside of current loop are out of 4895 // scope. 4896 auto isOutOfScope = [&](Value *V) -> bool { 4897 Instruction *I = dyn_cast<Instruction>(V); 4898 return (!I || !TheLoop->contains(I)); 4899 }; 4900 4901 SetVector<Instruction *> Worklist; 4902 BasicBlock *Latch = TheLoop->getLoopLatch(); 4903 4904 // Instructions that are scalar with predication must not be considered 4905 // uniform after vectorization, because that would create an erroneous 4906 // replicating region where only a single instance out of VF should be formed. 4907 // TODO: optimize such seldom cases if found important, see PR40816. 4908 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4909 if (isScalarWithPredication(I, VF)) { 4910 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4911 << *I << "\n"); 4912 return; 4913 } 4914 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4915 Worklist.insert(I); 4916 }; 4917 4918 // Start with the conditional branch. If the branch condition is an 4919 // instruction contained in the loop that is only used by the branch, it is 4920 // uniform. 4921 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4922 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4923 addToWorklistIfAllowed(Cmp); 4924 4925 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 4926 // are pointers that are treated like consecutive pointers during 4927 // vectorization. The pointer operands of interleaved accesses are an 4928 // example. 4929 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 4930 4931 // Holds pointer operands of instructions that are possibly non-uniform. 4932 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 4933 4934 auto isUniformDecision = [&](Instruction *I, unsigned VF) { 4935 InstWidening WideningDecision = getWideningDecision(I, VF); 4936 assert(WideningDecision != CM_Unknown && 4937 "Widening decision should be ready at this moment"); 4938 4939 return (WideningDecision == CM_Widen || 4940 WideningDecision == CM_Widen_Reverse || 4941 WideningDecision == CM_Interleave); 4942 }; 4943 // Iterate over the instructions in the loop, and collect all 4944 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 4945 // that a consecutive-like pointer operand will be scalarized, we collect it 4946 // in PossibleNonUniformPtrs instead. We use two sets here because a single 4947 // getelementptr instruction can be used by both vectorized and scalarized 4948 // memory instructions. For example, if a loop loads and stores from the same 4949 // location, but the store is conditional, the store will be scalarized, and 4950 // the getelementptr won't remain uniform. 4951 for (auto *BB : TheLoop->blocks()) 4952 for (auto &I : *BB) { 4953 // If there's no pointer operand, there's nothing to do. 
4954 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 4955 if (!Ptr) 4956 continue; 4957 4958 // True if all users of Ptr are memory accesses that have Ptr as their 4959 // pointer operand. 4960 auto UsersAreMemAccesses = 4961 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 4962 return getLoadStorePointerOperand(U) == Ptr; 4963 }); 4964 4965 // Ensure the memory instruction will not be scalarized or used by 4966 // gather/scatter, making its pointer operand non-uniform. If the pointer 4967 // operand is used by any instruction other than a memory access, we 4968 // conservatively assume the pointer operand may be non-uniform. 4969 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 4970 PossibleNonUniformPtrs.insert(Ptr); 4971 4972 // If the memory instruction will be vectorized and its pointer operand 4973 // is consecutive-like, or interleaving - the pointer operand should 4974 // remain uniform. 4975 else 4976 ConsecutiveLikePtrs.insert(Ptr); 4977 } 4978 4979 // Add to the Worklist all consecutive and consecutive-like pointers that 4980 // aren't also identified as possibly non-uniform. 4981 for (auto *V : ConsecutiveLikePtrs) 4982 if (!PossibleNonUniformPtrs.count(V)) 4983 addToWorklistIfAllowed(V); 4984 4985 // Expand Worklist in topological order: whenever a new instruction 4986 // is added , its users should be already inside Worklist. It ensures 4987 // a uniform instruction will only be used by uniform instructions. 4988 unsigned idx = 0; 4989 while (idx != Worklist.size()) { 4990 Instruction *I = Worklist[idx++]; 4991 4992 for (auto OV : I->operand_values()) { 4993 // isOutOfScope operands cannot be uniform instructions. 4994 if (isOutOfScope(OV)) 4995 continue; 4996 // First order recurrence Phi's should typically be considered 4997 // non-uniform. 4998 auto *OP = dyn_cast<PHINode>(OV); 4999 if (OP && Legal->isFirstOrderRecurrence(OP)) 5000 continue; 5001 // If all the users of the operand are uniform, then add the 5002 // operand into the uniform worklist. 5003 auto *OI = cast<Instruction>(OV); 5004 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5005 auto *J = cast<Instruction>(U); 5006 return Worklist.count(J) || 5007 (OI == getLoadStorePointerOperand(J) && 5008 isUniformDecision(J, VF)); 5009 })) 5010 addToWorklistIfAllowed(OI); 5011 } 5012 } 5013 5014 // Returns true if Ptr is the pointer operand of a memory access instruction 5015 // I, and I is known to not require scalarization. 5016 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5017 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5018 }; 5019 5020 // For an instruction to be added into Worklist above, all its users inside 5021 // the loop should also be in Worklist. However, this condition cannot be 5022 // true for phi nodes that form a cyclic dependence. We must process phi 5023 // nodes separately. An induction variable will remain uniform if all users 5024 // of the induction variable and induction variable update remain uniform. 5025 // The code below handles both pointer and non-pointer induction variables. 5026 for (auto &Induction : Legal->getInductionVars()) { 5027 auto *Ind = Induction.first; 5028 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5029 5030 // Determine if all users of the induction variable are uniform after 5031 // vectorization. 
5032 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5033 auto *I = cast<Instruction>(U); 5034 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5035 isVectorizedMemAccessUse(I, Ind); 5036 }); 5037 if (!UniformInd) 5038 continue; 5039 5040 // Determine if all users of the induction variable update instruction are 5041 // uniform after vectorization. 5042 auto UniformIndUpdate = 5043 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5044 auto *I = cast<Instruction>(U); 5045 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5046 isVectorizedMemAccessUse(I, IndUpdate); 5047 }); 5048 if (!UniformIndUpdate) 5049 continue; 5050 5051 // The induction variable and its update instruction will remain uniform. 5052 addToWorklistIfAllowed(Ind); 5053 addToWorklistIfAllowed(IndUpdate); 5054 } 5055 5056 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5057 } 5058 5059 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5060 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5061 5062 if (Legal->getRuntimePointerChecking()->Need) { 5063 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5064 "runtime pointer checks needed. Enable vectorization of this " 5065 "loop with '#pragma clang loop vectorize(enable)' when " 5066 "compiling with -Os/-Oz", 5067 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5068 return true; 5069 } 5070 5071 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5072 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5073 "runtime SCEV checks needed. Enable vectorization of this " 5074 "loop with '#pragma clang loop vectorize(enable)' when " 5075 "compiling with -Os/-Oz", 5076 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5077 return true; 5078 } 5079 5080 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5081 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5082 reportVectorizationFailure("Runtime stride check for small trip count", 5083 "runtime stride == 1 checks needed. Enable vectorization of " 5084 "this loop without such check by compiling with -Os/-Oz", 5085 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5086 return true; 5087 } 5088 5089 return false; 5090 } 5091 5092 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, 5093 unsigned UserIC) { 5094 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5095 // TODO: It may by useful to do since it's still likely to be dynamically 5096 // uniform if the target can skip. 5097 reportVectorizationFailure( 5098 "Not inserting runtime ptr check for divergent target", 5099 "runtime pointer checks needed. Not enabled for divergent target", 5100 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5101 return None; 5102 } 5103 5104 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5105 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5106 if (TC == 1) { 5107 reportVectorizationFailure("Single iteration (non) loop", 5108 "loop trip count is one, irrelevant for vectorization", 5109 "SingleIterationLoop", ORE, TheLoop); 5110 return None; 5111 } 5112 5113 switch (ScalarEpilogueStatus) { 5114 case CM_ScalarEpilogueAllowed: 5115 return UserVF ? 
                    UserVF : computeFeasibleMaxVF(TC);
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return None;
    break;
  }

  // Now try tail folding.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
  assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
  unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
  if (TC > 0 && TC % MaxVFtimesIC == 0) {
    // Accept MaxVF if we do not have a tail.
    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
    return MaxVF;
  }

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxVF;
  }

  if (TC == 0) {
    reportVectorizationFailure(
        "Unable to calculate the loop count due to complex control flow",
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
    return None;
  }

  reportVectorizationFailure(
      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time. "
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
  return None;
}

unsigned
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
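  // Worked example (assumed numbers): with 256-bit vector registers, a widest
  // type of 32 bits, and a safe dependence distance of 128 bits reported by
  // LAA, the usable register width is clamped to 128 bits below and
  // MaxVectorSize becomes 128 / 32 = 4 lanes.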
5197 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 5198 5199 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 5200 5201 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5202 // Note that both WidestRegister and WidestType may not be a powers of 2. 5203 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); 5204 5205 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5206 << " / " << WidestType << " bits.\n"); 5207 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5208 << WidestRegister << " bits.\n"); 5209 5210 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 5211 " into one vector!"); 5212 if (MaxVectorSize == 0) { 5213 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5214 MaxVectorSize = 1; 5215 return MaxVectorSize; 5216 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5217 isPowerOf2_32(ConstTripCount)) { 5218 // We need to clamp the VF to be the ConstTripCount. There is no point in 5219 // choosing a higher viable VF as done in the loop below. 5220 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5221 << ConstTripCount << "\n"); 5222 MaxVectorSize = ConstTripCount; 5223 return MaxVectorSize; 5224 } 5225 5226 unsigned MaxVF = MaxVectorSize; 5227 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5228 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5229 // Collect all viable vectorization factors larger than the default MaxVF 5230 // (i.e. MaxVectorSize). 5231 SmallVector<unsigned, 8> VFs; 5232 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5233 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5234 VFs.push_back(VS); 5235 5236 // For each VF calculate its register usage. 5237 auto RUs = calculateRegisterUsage(VFs); 5238 5239 // Select the largest VF which doesn't require more registers than existing 5240 // ones. 5241 for (int i = RUs.size() - 1; i >= 0; --i) { 5242 bool Selected = true; 5243 for (auto& pair : RUs[i].MaxLocalUsers) { 5244 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5245 if (pair.second > TargetNumRegisters) 5246 Selected = false; 5247 } 5248 if (Selected) { 5249 MaxVF = VFs[i]; 5250 break; 5251 } 5252 } 5253 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5254 if (MaxVF < MinVF) { 5255 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5256 << ") with target's minimum: " << MinVF << '\n'); 5257 MaxVF = MinVF; 5258 } 5259 } 5260 } 5261 return MaxVF; 5262 } 5263 5264 VectorizationFactor 5265 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5266 float Cost = expectedCost(1).first; 5267 const float ScalarCost = Cost; 5268 unsigned Width = 1; 5269 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5270 5271 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5272 if (ForceVectorization && MaxVF > 1) { 5273 // Ignore scalar width, because the user explicitly wants vectorization. 5274 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5275 // evaluation. 5276 Cost = std::numeric_limits<float>::max(); 5277 } 5278 5279 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5280 // Notice that the vector loop needs to be executed less times, so 5281 // we need to divide the cost of the vector loops by the width of 5282 // the vector elements. 
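    // Worked example (assumed costs): if the scalar loop costs 8 per
    // iteration and the VF=4 body costs 20, the per-lane cost is 20/4 = 5, so
    // VF=4 is preferred; a VF=8 body costing 48 (6 per lane) would not
    // replace it.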
5283 VectorizationCostTy C = expectedCost(i); 5284 float VectorCost = C.first / (float)i; 5285 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5286 << " costs: " << (int)VectorCost << ".\n"); 5287 if (!C.second && !ForceVectorization) { 5288 LLVM_DEBUG( 5289 dbgs() << "LV: Not considering vector loop of width " << i 5290 << " because it will not generate any vector instructions.\n"); 5291 continue; 5292 } 5293 if (VectorCost < Cost) { 5294 Cost = VectorCost; 5295 Width = i; 5296 } 5297 } 5298 5299 if (!EnableCondStoresVectorization && NumPredStores) { 5300 reportVectorizationFailure("There are conditional stores.", 5301 "store that is conditionally executed prevents vectorization", 5302 "ConditionalStore", ORE, TheLoop); 5303 Width = 1; 5304 Cost = ScalarCost; 5305 } 5306 5307 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5308 << "LV: Vectorization seems to be not beneficial, " 5309 << "but was forced by a user.\n"); 5310 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5311 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; 5312 return Factor; 5313 } 5314 5315 std::pair<unsigned, unsigned> 5316 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5317 unsigned MinWidth = -1U; 5318 unsigned MaxWidth = 8; 5319 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5320 5321 // For each block. 5322 for (BasicBlock *BB : TheLoop->blocks()) { 5323 // For each instruction in the loop. 5324 for (Instruction &I : BB->instructionsWithoutDebug()) { 5325 Type *T = I.getType(); 5326 5327 // Skip ignored values. 5328 if (ValuesToIgnore.count(&I)) 5329 continue; 5330 5331 // Only examine Loads, Stores and PHINodes. 5332 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5333 continue; 5334 5335 // Examine PHI nodes that are reduction variables. Update the type to 5336 // account for the recurrence type. 5337 if (auto *PN = dyn_cast<PHINode>(&I)) { 5338 if (!Legal->isReductionVariable(PN)) 5339 continue; 5340 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5341 T = RdxDesc.getRecurrenceType(); 5342 } 5343 5344 // Examine the stored values. 5345 if (auto *ST = dyn_cast<StoreInst>(&I)) 5346 T = ST->getValueOperand()->getType(); 5347 5348 // Ignore loaded pointer types and stored pointer types that are not 5349 // vectorizable. 5350 // 5351 // FIXME: The check here attempts to predict whether a load or store will 5352 // be vectorized. We only know this for certain after a VF has 5353 // been selected. Here, we assume that if an access can be 5354 // vectorized, it will be. We should also look at extending this 5355 // optimization to non-pointer types. 5356 // 5357 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5358 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5359 continue; 5360 5361 MinWidth = std::min(MinWidth, 5362 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5363 MaxWidth = std::max(MaxWidth, 5364 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5365 } 5366 } 5367 5368 return {MinWidth, MaxWidth}; 5369 } 5370 5371 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, 5372 unsigned LoopCost) { 5373 // -- The interleave heuristics -- 5374 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5375 // There are many micro-architectural considerations that we can't predict 5376 // at this level. 
For example, frontend pressure (on decode or fetch) due to 5377 // code size, or the number and capabilities of the execution ports. 5378 // 5379 // We use the following heuristics to select the interleave count: 5380 // 1. If the code has reductions, then we interleave to break the cross 5381 // iteration dependency. 5382 // 2. If the loop is really small, then we interleave to reduce the loop 5383 // overhead. 5384 // 3. We don't interleave if we think that we will spill registers to memory 5385 // due to the increased register pressure. 5386 5387 if (!isScalarEpilogueAllowed()) 5388 return 1; 5389 5390 // We used the distance for the interleave count. 5391 if (Legal->getMaxSafeDepDistBytes() != -1U) 5392 return 1; 5393 5394 // Do not interleave loops with a relatively small known or estimated trip 5395 // count. 5396 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5397 if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) 5398 return 1; 5399 5400 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5401 // We divide by these constants so assume that we have at least one 5402 // instruction that uses at least one register. 5403 for (auto& pair : R.MaxLocalUsers) { 5404 pair.second = std::max(pair.second, 1U); 5405 } 5406 5407 // We calculate the interleave count using the following formula. 5408 // Subtract the number of loop invariants from the number of available 5409 // registers. These registers are used by all of the interleaved instances. 5410 // Next, divide the remaining registers by the number of registers that is 5411 // required by the loop, in order to estimate how many parallel instances 5412 // fit without causing spills. All of this is rounded down if necessary to be 5413 // a power of two. We want power of two interleave count to simplify any 5414 // addressing operations or alignment considerations. 5415 // We also want power of two interleave counts to ensure that the induction 5416 // variable of the vector loop wraps to zero, when tail is folded by masking; 5417 // this currently happens when OptForSize, in which case IC is set to 1 above. 5418 unsigned IC = UINT_MAX; 5419 5420 for (auto& pair : R.MaxLocalUsers) { 5421 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5422 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5423 << " registers of " 5424 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5425 if (VF == 1) { 5426 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5427 TargetNumRegisters = ForceTargetNumScalarRegs; 5428 } else { 5429 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5430 TargetNumRegisters = ForceTargetNumVectorRegs; 5431 } 5432 unsigned MaxLocalUsers = pair.second; 5433 unsigned LoopInvariantRegs = 0; 5434 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5435 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5436 5437 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5438 // Don't count the induction variable as interleaved. 5439 if (EnableIndVarRegisterHeur) { 5440 TmpIC = 5441 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5442 std::max(1U, (MaxLocalUsers - 1))); 5443 } 5444 5445 IC = std::min(IC, TmpIC); 5446 } 5447 5448 // Clamp the interleave ranges to reasonable counts. 5449 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5450 5451 // Check if the user has overridden the max. 
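  // Worked example for the formula above (assumed numbers): with 32 vector
  // registers, 2 of them holding loop invariants, and a peak of 6 registers
  // live per instance, IC = PowerOf2Floor((32 - 2) / 6) = 4; a target maximum
  // interleave factor of 2, or an estimated trip count of 16 with VF=8, would
  // clamp it further in the code below.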
5452 if (VF == 1) { 5453 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5454 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5455 } else { 5456 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5457 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5458 } 5459 5460 // If trip count is known or estimated compile time constant, limit the 5461 // interleave count to be less than the trip count divided by VF. 5462 if (BestKnownTC) { 5463 MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); 5464 } 5465 5466 // If we did not calculate the cost for VF (because the user selected the VF) 5467 // then we calculate the cost of VF here. 5468 if (LoopCost == 0) 5469 LoopCost = expectedCost(VF).first; 5470 5471 assert(LoopCost && "Non-zero loop cost expected"); 5472 5473 // Clamp the calculated IC to be between the 1 and the max interleave count 5474 // that the target and trip count allows. 5475 if (IC > MaxInterleaveCount) 5476 IC = MaxInterleaveCount; 5477 else if (IC < 1) 5478 IC = 1; 5479 5480 // Interleave if we vectorized this loop and there is a reduction that could 5481 // benefit from interleaving. 5482 if (VF > 1 && !Legal->getReductionVars().empty()) { 5483 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5484 return IC; 5485 } 5486 5487 // Note that if we've already vectorized the loop we will have done the 5488 // runtime check and so interleaving won't require further checks. 5489 bool InterleavingRequiresRuntimePointerCheck = 5490 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5491 5492 // We want to interleave small loops in order to reduce the loop overhead and 5493 // potentially expose ILP opportunities. 5494 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5495 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5496 // We assume that the cost overhead is 1 and we use the cost model 5497 // to estimate the cost of the loop and interleave until the cost of the 5498 // loop overhead is about 5% of the cost of the loop. 5499 unsigned SmallIC = 5500 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5501 5502 // Interleave until store/load ports (estimated by max interleave count) are 5503 // saturated. 5504 unsigned NumStores = Legal->getNumStores(); 5505 unsigned NumLoads = Legal->getNumLoads(); 5506 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5507 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5508 5509 // If we have a scalar reduction (vector reductions are already dealt with 5510 // by this point), we can increase the critical path length if the loop 5511 // we're interleaving is inside another loop. Limit, by default to 2, so the 5512 // critical path only gets increased by one reduction operation. 
    if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  bool HasReductions = !Legal->getReductionVars().empty();
  if (TTI.enableAggressiveInterleaving(HasReductions)) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is a
  // very rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are met before
  // their users. We assume that each instruction that has in-loop users starts
  // an interval. We record every time that an in-loop value is used, so we
  // have a list of the first and last occurrences of each instruction. Next,
  // we transpose this data structure into a multi-map that holds the list of
  // intervals that *end* at a specific location. This multi-map allows us to
  // perform a linear search. We scan the instructions linearly and record each
  // time that a new interval starts, by placing it in a set. If we find this
  // value in the multi-map then we remove it from the set. The max register
  // usage is the maximum size of the set. We also search for instructions that
  // are defined outside the loop but are used inside the loop. We need this
  // number separately from the max-interval usage number because, when we
  // unroll, loop-invariant values do not take more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps an index to its instruction.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the instructions that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
5587 for (Value *U : I.operands()) { 5588 auto *Instr = dyn_cast<Instruction>(U); 5589 5590 // Ignore non-instruction values such as arguments, constants, etc. 5591 if (!Instr) 5592 continue; 5593 5594 // If this instruction is outside the loop then record it and continue. 5595 if (!TheLoop->contains(Instr)) { 5596 LoopInvariants.insert(Instr); 5597 continue; 5598 } 5599 5600 // Overwrite previous end points. 5601 EndPoint[Instr] = IdxToInstr.size(); 5602 Ends.insert(Instr); 5603 } 5604 } 5605 } 5606 5607 // Saves the list of intervals that end with the index in 'key'. 5608 using InstrList = SmallVector<Instruction *, 2>; 5609 DenseMap<unsigned, InstrList> TransposeEnds; 5610 5611 // Transpose the EndPoints to a list of values that end at each index. 5612 for (auto &Interval : EndPoint) 5613 TransposeEnds[Interval.second].push_back(Interval.first); 5614 5615 SmallPtrSet<Instruction *, 8> OpenIntervals; 5616 5617 // Get the size of the widest register. 5618 unsigned MaxSafeDepDist = -1U; 5619 if (Legal->getMaxSafeDepDistBytes() != -1U) 5620 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5621 unsigned WidestRegister = 5622 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5623 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5624 5625 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5626 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5627 5628 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5629 5630 // A lambda that gets the register usage for the given type and VF. 5631 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5632 if (Ty->isTokenTy()) 5633 return 0U; 5634 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5635 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5636 }; 5637 5638 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5639 Instruction *I = IdxToInstr[i]; 5640 5641 // Remove all of the instructions that end at this location. 5642 InstrList &List = TransposeEnds[i]; 5643 for (Instruction *ToRemove : List) 5644 OpenIntervals.erase(ToRemove); 5645 5646 // Ignore instructions that are never used within the loop. 5647 if (!Ends.count(I)) 5648 continue; 5649 5650 // Skip ignored values. 5651 if (ValuesToIgnore.count(I)) 5652 continue; 5653 5654 // For each VF find the maximum usage of registers. 5655 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5656 // Count the number of live intervals. 5657 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5658 5659 if (VFs[j] == 1) { 5660 for (auto Inst : OpenIntervals) { 5661 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5662 if (RegUsage.find(ClassID) == RegUsage.end()) 5663 RegUsage[ClassID] = 1; 5664 else 5665 RegUsage[ClassID] += 1; 5666 } 5667 } else { 5668 collectUniformsAndScalars(VFs[j]); 5669 for (auto Inst : OpenIntervals) { 5670 // Skip ignored values for VF > 1. 
5671 if (VecValuesToIgnore.count(Inst)) 5672 continue; 5673 if (isScalarAfterVectorization(Inst, VFs[j])) { 5674 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5675 if (RegUsage.find(ClassID) == RegUsage.end()) 5676 RegUsage[ClassID] = 1; 5677 else 5678 RegUsage[ClassID] += 1; 5679 } else { 5680 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5681 if (RegUsage.find(ClassID) == RegUsage.end()) 5682 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5683 else 5684 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5685 } 5686 } 5687 } 5688 5689 for (auto& pair : RegUsage) { 5690 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5691 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5692 else 5693 MaxUsages[j][pair.first] = pair.second; 5694 } 5695 } 5696 5697 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5698 << OpenIntervals.size() << '\n'); 5699 5700 // Add the current instruction to the list of open intervals. 5701 OpenIntervals.insert(I); 5702 } 5703 5704 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5705 SmallMapVector<unsigned, unsigned, 4> Invariant; 5706 5707 for (auto Inst : LoopInvariants) { 5708 unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5709 unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); 5710 if (Invariant.find(ClassID) == Invariant.end()) 5711 Invariant[ClassID] = Usage; 5712 else 5713 Invariant[ClassID] += Usage; 5714 } 5715 5716 LLVM_DEBUG({ 5717 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5718 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5719 << " item\n"; 5720 for (const auto &pair : MaxUsages[i]) { 5721 dbgs() << "LV(REG): RegisterClass: " 5722 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5723 << " registers\n"; 5724 } 5725 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5726 << " item\n"; 5727 for (const auto &pair : Invariant) { 5728 dbgs() << "LV(REG): RegisterClass: " 5729 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5730 << " registers\n"; 5731 } 5732 }); 5733 5734 RU.LoopInvariantRegs = Invariant; 5735 RU.MaxLocalUsers = MaxUsages[i]; 5736 RUs[i] = RU; 5737 } 5738 5739 return RUs; 5740 } 5741 5742 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5743 // TODO: Cost model for emulated masked load/store is completely 5744 // broken. This hack guides the cost model to use an artificially 5745 // high enough value to practically disable vectorization with such 5746 // operations, except where previously deployed legality hack allowed 5747 // using very low cost values. This is to avoid regressions coming simply 5748 // from moving "masked load/store" check from legality to cost model. 5749 // Masked Load/Gather emulation was previously never allowed. 5750 // Limited number of Masked Store/Scatter emulation was allowed. 5751 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 5752 return isa<LoadInst>(I) || 5753 (isa<StoreInst>(I) && 5754 NumPredStores > NumberOfStoresToPredicate); 5755 } 5756 5757 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { 5758 // If we aren't vectorizing the loop, or if we've already collected the 5759 // instructions to scalarize, there's nothing to do. Collection may already 5760 // have occurred if we have a user-selected VF and are now computing the 5761 // expected cost for interleaving. 
  if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine whether it would be better not to if-convert the blocks they are
  // in. If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply the discount logic if the hacked cost is needed
        // for emulated masked memrefs.
        if (!useEmulatedMaskMemRefHack(&I) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }
  }
}

int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
    unsigned VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  int Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
5831 for (Use &U : I->operands()) 5832 if (auto *J = dyn_cast<Instruction>(U.get())) 5833 if (isUniformAfterVectorization(J, VF)) 5834 return false; 5835 5836 // Otherwise, we can scalarize the instruction. 5837 return true; 5838 }; 5839 5840 // Compute the expected cost discount from scalarizing the entire expression 5841 // feeding the predicated instruction. We currently only consider expressions 5842 // that are single-use instruction chains. 5843 Worklist.push_back(PredInst); 5844 while (!Worklist.empty()) { 5845 Instruction *I = Worklist.pop_back_val(); 5846 5847 // If we've already analyzed the instruction, there's nothing to do. 5848 if (ScalarCosts.find(I) != ScalarCosts.end()) 5849 continue; 5850 5851 // Compute the cost of the vector instruction. Note that this cost already 5852 // includes the scalarization overhead of the predicated instruction. 5853 unsigned VectorCost = getInstructionCost(I, VF).first; 5854 5855 // Compute the cost of the scalarized instruction. This cost is the cost of 5856 // the instruction as if it wasn't if-converted and instead remained in the 5857 // predicated block. We will scale this cost by block probability after 5858 // computing the scalarization overhead. 5859 unsigned ScalarCost = VF * getInstructionCost(I, 1).first; 5860 5861 // Compute the scalarization overhead of needed insertelement instructions 5862 // and phi nodes. 5863 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5864 ScalarCost += TTI.getScalarizationOverhead( 5865 cast<VectorType>(ToVectorTy(I->getType(), VF)), 5866 APInt::getAllOnesValue(VF), true, false); 5867 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI, 5868 TTI::TCK_RecipThroughput); 5869 } 5870 5871 // Compute the scalarization overhead of needed extractelement 5872 // instructions. For each of the instruction's operands, if the operand can 5873 // be scalarized, add it to the worklist; otherwise, account for the 5874 // overhead. 5875 for (Use &U : I->operands()) 5876 if (auto *J = dyn_cast<Instruction>(U.get())) { 5877 assert(VectorType::isValidElementType(J->getType()) && 5878 "Instruction has non-scalar type"); 5879 if (canBeScalarized(J)) 5880 Worklist.push_back(J); 5881 else if (needsExtract(J, VF)) 5882 ScalarCost += TTI.getScalarizationOverhead( 5883 cast<VectorType>(ToVectorTy(J->getType(), VF)), 5884 APInt::getAllOnesValue(VF), false, true); 5885 } 5886 5887 // Scale the total scalar cost by block probability. 5888 ScalarCost /= getReciprocalPredBlockProb(); 5889 5890 // Compute the discount. A non-negative discount means the vector version 5891 // of the instruction costs more, and scalarizing would be beneficial. 5892 Discount += VectorCost - ScalarCost; 5893 ScalarCosts[I] = ScalarCost; 5894 } 5895 5896 return Discount; 5897 } 5898 5899 LoopVectorizationCostModel::VectorizationCostTy 5900 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5901 VectorizationCostTy Cost; 5902 5903 // For each block. 5904 for (BasicBlock *BB : TheLoop->blocks()) { 5905 VectorizationCostTy BlockCost; 5906 5907 // For each instruction in the old loop. 5908 for (Instruction &I : BB->instructionsWithoutDebug()) { 5909 // Skip ignored values. 5910 if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I))) 5911 continue; 5912 5913 VectorizationCostTy C = getInstructionCost(&I, VF); 5914 5915 // Check if we should override the cost. 
5916 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5917 C.first = ForceTargetInstructionCost; 5918 5919 BlockCost.first += C.first; 5920 BlockCost.second |= C.second; 5921 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5922 << " for VF " << VF << " For instruction: " << I 5923 << '\n'); 5924 } 5925 5926 // If we are vectorizing a predicated block, it will have been 5927 // if-converted. This means that the block's instructions (aside from 5928 // stores and instructions that may divide by zero) will now be 5929 // unconditionally executed. For the scalar case, we may not always execute 5930 // the predicated block. Thus, scale the block's cost by the probability of 5931 // executing it. 5932 if (VF == 1 && blockNeedsPredication(BB)) 5933 BlockCost.first /= getReciprocalPredBlockProb(); 5934 5935 Cost.first += BlockCost.first; 5936 Cost.second |= BlockCost.second; 5937 } 5938 5939 return Cost; 5940 } 5941 5942 /// Gets Address Access SCEV after verifying that the access pattern 5943 /// is loop invariant except the induction variable dependence. 5944 /// 5945 /// This SCEV can be sent to the Target in order to estimate the address 5946 /// calculation cost. 5947 static const SCEV *getAddressAccessSCEV( 5948 Value *Ptr, 5949 LoopVectorizationLegality *Legal, 5950 PredicatedScalarEvolution &PSE, 5951 const Loop *TheLoop) { 5952 5953 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5954 if (!Gep) 5955 return nullptr; 5956 5957 // We are looking for a gep with all loop invariant indices except for one 5958 // which should be an induction variable. 5959 auto SE = PSE.getSE(); 5960 unsigned NumOperands = Gep->getNumOperands(); 5961 for (unsigned i = 1; i < NumOperands; ++i) { 5962 Value *Opd = Gep->getOperand(i); 5963 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5964 !Legal->isInductionVariable(Opd)) 5965 return nullptr; 5966 } 5967 5968 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 5969 return PSE.getSCEV(Ptr); 5970 } 5971 5972 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 5973 return Legal->hasStride(I->getOperand(0)) || 5974 Legal->hasStride(I->getOperand(1)); 5975 } 5976 5977 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5978 unsigned VF) { 5979 assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); 5980 Type *ValTy = getMemInstValueType(I); 5981 auto SE = PSE.getSE(); 5982 5983 unsigned AS = getLoadStoreAddressSpace(I); 5984 Value *Ptr = getLoadStorePointerOperand(I); 5985 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5986 5987 // Figure out whether the access is strided and get the stride value 5988 // if it's known in compile time 5989 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5990 5991 // Get the cost of the scalar memory instruction and address computation. 5992 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5993 5994 // Don't pass *I here, since it is scalar but will actually be part of a 5995 // vectorized loop where the user of it is a vectorized instruction. 5996 const Align Alignment = getLoadStoreAlignment(I); 5997 Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 5998 Alignment, AS, 5999 TTI::TCK_RecipThroughput); 6000 6001 // Get the overhead of the extractelement and insertelement instructions 6002 // we might create due to scalarization. 
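  // Rough sketch of the running total (hypothetical VF = 4): four address
  // computations plus four scalar memory ops so far; the call below adds the
  // insertelement/extractelement overhead, and a predicated access is then
  // divided by getReciprocalPredBlockProb() to account for the block's
  // execution probability or, when useEmulatedMaskMemRefHack applies, pinned
  // to an artificially high cost.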
6003 Cost += getScalarizationOverhead(I, VF); 6004 6005 // If we have a predicated store, it may not be executed for each vector 6006 // lane. Scale the cost by the probability of executing the predicated 6007 // block. 6008 if (isPredicatedInst(I)) { 6009 Cost /= getReciprocalPredBlockProb(); 6010 6011 if (useEmulatedMaskMemRefHack(I)) 6012 // Artificially setting to a high enough value to practically disable 6013 // vectorization with such operations. 6014 Cost = 3000000; 6015 } 6016 6017 return Cost; 6018 } 6019 6020 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6021 unsigned VF) { 6022 Type *ValTy = getMemInstValueType(I); 6023 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6024 Value *Ptr = getLoadStorePointerOperand(I); 6025 unsigned AS = getLoadStoreAddressSpace(I); 6026 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6027 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6028 6029 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6030 "Stride should be 1 or -1 for consecutive memory access"); 6031 const Align Alignment = getLoadStoreAlignment(I); 6032 unsigned Cost = 0; 6033 if (Legal->isMaskRequired(I)) 6034 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6035 CostKind); 6036 else 6037 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6038 CostKind, I); 6039 6040 bool Reverse = ConsecutiveStride < 0; 6041 if (Reverse) 6042 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6043 return Cost; 6044 } 6045 6046 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6047 unsigned VF) { 6048 Type *ValTy = getMemInstValueType(I); 6049 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6050 const Align Alignment = getLoadStoreAlignment(I); 6051 unsigned AS = getLoadStoreAddressSpace(I); 6052 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6053 if (isa<LoadInst>(I)) { 6054 return TTI.getAddressComputationCost(ValTy) + 6055 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6056 CostKind) + 6057 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6058 } 6059 StoreInst *SI = cast<StoreInst>(I); 6060 6061 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6062 return TTI.getAddressComputationCost(ValTy) + 6063 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6064 CostKind) + 6065 (isLoopInvariantStoreValue 6066 ? 
0 6067 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6068 VF - 1)); 6069 } 6070 6071 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6072 unsigned VF) { 6073 Type *ValTy = getMemInstValueType(I); 6074 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6075 const Align Alignment = getLoadStoreAlignment(I); 6076 const Value *Ptr = getLoadStorePointerOperand(I); 6077 6078 return TTI.getAddressComputationCost(VectorTy) + 6079 TTI.getGatherScatterOpCost( 6080 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6081 TargetTransformInfo::TCK_RecipThroughput, I); 6082 } 6083 6084 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6085 unsigned VF) { 6086 Type *ValTy = getMemInstValueType(I); 6087 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6088 unsigned AS = getLoadStoreAddressSpace(I); 6089 6090 auto Group = getInterleavedAccessGroup(I); 6091 assert(Group && "Fail to get an interleaved access group."); 6092 6093 unsigned InterleaveFactor = Group->getFactor(); 6094 auto *WideVecTy = FixedVectorType::get(ValTy, VF * InterleaveFactor); 6095 6096 // Holds the indices of existing members in an interleaved load group. 6097 // An interleaved store group doesn't need this as it doesn't allow gaps. 6098 SmallVector<unsigned, 4> Indices; 6099 if (isa<LoadInst>(I)) { 6100 for (unsigned i = 0; i < InterleaveFactor; i++) 6101 if (Group->getMember(i)) 6102 Indices.push_back(i); 6103 } 6104 6105 // Calculate the cost of the whole interleaved group. 6106 bool UseMaskForGaps = 6107 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6108 unsigned Cost = TTI.getInterleavedMemoryOpCost( 6109 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6110 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6111 6112 if (Group->isReverse()) { 6113 // TODO: Add support for reversed masked interleaved access. 6114 assert(!Legal->isMaskRequired(I) && 6115 "Reverse masked interleaved access not supported."); 6116 Cost += Group->getNumMembers() * 6117 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6118 } 6119 return Cost; 6120 } 6121 6122 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6123 unsigned VF) { 6124 // Calculate scalar cost only. Vectorization cost should be ready at this 6125 // moment. 6126 if (VF == 1) { 6127 Type *ValTy = getMemInstValueType(I); 6128 const Align Alignment = getLoadStoreAlignment(I); 6129 unsigned AS = getLoadStoreAddressSpace(I); 6130 6131 return TTI.getAddressComputationCost(ValTy) + 6132 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6133 TTI::TCK_RecipThroughput, I); 6134 } 6135 return getWideningCost(I, VF); 6136 } 6137 6138 LoopVectorizationCostModel::VectorizationCostTy 6139 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 6140 // If we know that this instruction will remain uniform, check the cost of 6141 // the scalar version. 6142 if (isUniformAfterVectorization(I, VF)) 6143 VF = 1; 6144 6145 if (VF > 1 && isProfitableToScalarize(I, VF)) 6146 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6147 6148 // Forced scalars do not have any scalarization overhead. 
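  // Such an instruction is simply charged VF copies of its scalar (VF == 1)
  // cost in the lookup below.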
6149 auto ForcedScalar = ForcedScalars.find(VF); 6150 if (VF > 1 && ForcedScalar != ForcedScalars.end()) { 6151 auto InstSet = ForcedScalar->second; 6152 if (InstSet.count(I)) 6153 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); 6154 } 6155 6156 Type *VectorTy; 6157 unsigned C = getInstructionCost(I, VF, VectorTy); 6158 6159 bool TypeNotScalarized = 6160 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; 6161 return VectorizationCostTy(C, TypeNotScalarized); 6162 } 6163 6164 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6165 unsigned VF) { 6166 6167 if (VF == 1) 6168 return 0; 6169 6170 unsigned Cost = 0; 6171 Type *RetTy = ToVectorTy(I->getType(), VF); 6172 if (!RetTy->isVoidTy() && 6173 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6174 Cost += TTI.getScalarizationOverhead( 6175 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF), true, false); 6176 6177 // Some targets keep addresses scalar. 6178 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6179 return Cost; 6180 6181 // Some targets support efficient element stores. 6182 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6183 return Cost; 6184 6185 // Collect operands to consider. 6186 CallInst *CI = dyn_cast<CallInst>(I); 6187 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6188 6189 // Skip operands that do not require extraction/scalarization and do not incur 6190 // any overhead. 6191 return Cost + TTI.getOperandsScalarizationOverhead( 6192 filterExtractingOperands(Ops, VF), VF); 6193 } 6194 6195 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { 6196 if (VF == 1) 6197 return; 6198 NumPredStores = 0; 6199 for (BasicBlock *BB : TheLoop->blocks()) { 6200 // For each instruction in the old loop. 6201 for (Instruction &I : *BB) { 6202 Value *Ptr = getLoadStorePointerOperand(&I); 6203 if (!Ptr) 6204 continue; 6205 6206 // TODO: We should generate better code and update the cost model for 6207 // predicated uniform stores. Today they are treated as any other 6208 // predicated store (see added test cases in 6209 // invariant-store-vectorization.ll). 6210 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6211 NumPredStores++; 6212 6213 if (Legal->isUniform(Ptr) && 6214 // Conditional loads and stores should be scalarized and predicated. 6215 // isScalarWithPredication cannot be used here since masked 6216 // gather/scatters are not considered scalar with predication. 6217 !Legal->blockNeedsPredication(I.getParent())) { 6218 // TODO: Avoid replicating loads and stores instead of 6219 // relying on instcombine to remove them. 6220 // Load: Scalar load + broadcast 6221 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6222 unsigned Cost = getUniformMemOpCost(&I, VF); 6223 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6224 continue; 6225 } 6226 6227 // We assume that widening is the best solution when possible. 6228 if (memoryInstructionCanBeWidened(&I, VF)) { 6229 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6230 int ConsecutiveStride = 6231 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6232 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6233 "Expected consecutive stride."); 6234 InstWidening Decision = 6235 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6236 setWideningDecision(&I, VF, Decision, Cost); 6237 continue; 6238 } 6239 6240 // Choose between Interleaving, Gather/Scatter or Scalarization. 
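      // For example (hypothetical costs): with InterleaveCost = 8,
      // GatherScatterCost = 12 and ScalarizationCost = 20, the comparisons
      // below pick CM_Interleave. Note that interleaving wins a tie against
      // gather/scatter, while scalarization wins a tie against gather/scatter.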
6241 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6242 unsigned NumAccesses = 1; 6243 if (isAccessInterleaved(&I)) { 6244 auto Group = getInterleavedAccessGroup(&I); 6245 assert(Group && "Fail to get an interleaved access group."); 6246 6247 // Make one decision for the whole group. 6248 if (getWideningDecision(&I, VF) != CM_Unknown) 6249 continue; 6250 6251 NumAccesses = Group->getNumMembers(); 6252 if (interleavedAccessCanBeWidened(&I, VF)) 6253 InterleaveCost = getInterleaveGroupCost(&I, VF); 6254 } 6255 6256 unsigned GatherScatterCost = 6257 isLegalGatherOrScatter(&I) 6258 ? getGatherScatterCost(&I, VF) * NumAccesses 6259 : std::numeric_limits<unsigned>::max(); 6260 6261 unsigned ScalarizationCost = 6262 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6263 6264 // Choose better solution for the current VF, 6265 // write down this decision and use it during vectorization. 6266 unsigned Cost; 6267 InstWidening Decision; 6268 if (InterleaveCost <= GatherScatterCost && 6269 InterleaveCost < ScalarizationCost) { 6270 Decision = CM_Interleave; 6271 Cost = InterleaveCost; 6272 } else if (GatherScatterCost < ScalarizationCost) { 6273 Decision = CM_GatherScatter; 6274 Cost = GatherScatterCost; 6275 } else { 6276 Decision = CM_Scalarize; 6277 Cost = ScalarizationCost; 6278 } 6279 // If the instructions belongs to an interleave group, the whole group 6280 // receives the same decision. The whole group receives the cost, but 6281 // the cost will actually be assigned to one instruction. 6282 if (auto Group = getInterleavedAccessGroup(&I)) 6283 setWideningDecision(Group, VF, Decision, Cost); 6284 else 6285 setWideningDecision(&I, VF, Decision, Cost); 6286 } 6287 } 6288 6289 // Make sure that any load of address and any other address computation 6290 // remains scalar unless there is gather/scatter support. This avoids 6291 // inevitable extracts into address registers, and also has the benefit of 6292 // activating LSR more, since that pass can't optimize vectorized 6293 // addresses. 6294 if (TTI.prefersVectorizedAddressing()) 6295 return; 6296 6297 // Start with all scalar pointer uses. 6298 SmallPtrSet<Instruction *, 8> AddrDefs; 6299 for (BasicBlock *BB : TheLoop->blocks()) 6300 for (Instruction &I : *BB) { 6301 Instruction *PtrDef = 6302 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6303 if (PtrDef && TheLoop->contains(PtrDef) && 6304 getWideningDecision(&I, VF) != CM_GatherScatter) 6305 AddrDefs.insert(PtrDef); 6306 } 6307 6308 // Add all instructions used to generate the addresses. 6309 SmallVector<Instruction *, 4> Worklist; 6310 for (auto *I : AddrDefs) 6311 Worklist.push_back(I); 6312 while (!Worklist.empty()) { 6313 Instruction *I = Worklist.pop_back_val(); 6314 for (auto &Op : I->operands()) 6315 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6316 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6317 AddrDefs.insert(InstOp).second) 6318 Worklist.push_back(InstOp); 6319 } 6320 6321 for (auto *I : AddrDefs) { 6322 if (isa<LoadInst>(I)) { 6323 // Setting the desired widening decision should ideally be handled in 6324 // by cost functions, but since this involves the task of finding out 6325 // if the loaded register is involved in an address computation, it is 6326 // instead changed here when we know this is the case. 6327 InstWidening Decision = getWideningDecision(I, VF); 6328 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6329 // Scalarize a widened load of address. 
6330 setWideningDecision(I, VF, CM_Scalarize, 6331 (VF * getMemoryInstructionCost(I, 1))); 6332 else if (auto Group = getInterleavedAccessGroup(I)) { 6333 // Scalarize an interleave group of address loads. 6334 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6335 if (Instruction *Member = Group->getMember(I)) 6336 setWideningDecision(Member, VF, CM_Scalarize, 6337 (VF * getMemoryInstructionCost(Member, 1))); 6338 } 6339 } 6340 } else 6341 // Make sure I gets scalarized and a cost estimate without 6342 // scalarization overhead. 6343 ForcedScalars[VF].insert(I); 6344 } 6345 } 6346 6347 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6348 unsigned VF, 6349 Type *&VectorTy) { 6350 Type *RetTy = I->getType(); 6351 if (canTruncateToMinimalBitwidth(I, VF)) 6352 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6353 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6354 auto SE = PSE.getSE(); 6355 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6356 6357 // TODO: We need to estimate the cost of intrinsic calls. 6358 switch (I->getOpcode()) { 6359 case Instruction::GetElementPtr: 6360 // We mark this instruction as zero-cost because the cost of GEPs in 6361 // vectorized code depends on whether the corresponding memory instruction 6362 // is scalarized or not. Therefore, we handle GEPs with the memory 6363 // instruction cost. 6364 return 0; 6365 case Instruction::Br: { 6366 // In cases of scalarized and predicated instructions, there will be VF 6367 // predicated blocks in the vectorized loop. Each branch around these 6368 // blocks requires also an extract of its vector compare i1 element. 6369 bool ScalarPredicatedBB = false; 6370 BranchInst *BI = cast<BranchInst>(I); 6371 if (VF > 1 && BI->isConditional() && 6372 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 6373 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 6374 ScalarPredicatedBB = true; 6375 6376 if (ScalarPredicatedBB) { 6377 // Return cost for branches around scalarized and predicated blocks. 6378 auto *Vec_i1Ty = 6379 FixedVectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6380 return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF), 6381 false, true) + 6382 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF)); 6383 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) 6384 // The back-edge branch will remain, as will all scalar branches. 6385 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6386 else 6387 // This branch will be eliminated by if-conversion. 6388 return 0; 6389 // Note: We currently assume zero cost for an unconditional branch inside 6390 // a predicated block since it will become a fall-through, although we 6391 // may decide in the future to call TTI for all branches. 6392 } 6393 case Instruction::PHI: { 6394 auto *Phi = cast<PHINode>(I); 6395 6396 // First-order recurrences are replaced by vector shuffles inside the loop. 6397 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6398 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) 6399 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 6400 cast<VectorType>(VectorTy), VF - 1, 6401 FixedVectorType::get(RetTy, 1)); 6402 6403 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6404 // converted into select instructions. We require N - 1 selects per phi 6405 // node, where N is the number of incoming values. 
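    // For example, a phi merging three incoming values becomes two selects, so
    // at VF = 4 it is charged twice the cost of a select on the <4 x ...>
    // widened phi type with an i1 vector condition.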
6406 if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) 6407 return (Phi->getNumIncomingValues() - 1) * 6408 TTI.getCmpSelInstrCost( 6409 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6410 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6411 CostKind); 6412 6413 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6414 } 6415 case Instruction::UDiv: 6416 case Instruction::SDiv: 6417 case Instruction::URem: 6418 case Instruction::SRem: 6419 // If we have a predicated instruction, it may not be executed for each 6420 // vector lane. Get the scalarization cost and scale this amount by the 6421 // probability of executing the predicated block. If the instruction is not 6422 // predicated, we fall through to the next case. 6423 if (VF > 1 && isScalarWithPredication(I)) { 6424 unsigned Cost = 0; 6425 6426 // These instructions have a non-void type, so account for the phi nodes 6427 // that we will create. This cost is likely to be zero. The phi node 6428 // cost, if any, should be scaled by the block probability because it 6429 // models a copy at the end of each predicated block. 6430 Cost += VF * TTI.getCFInstrCost(Instruction::PHI, CostKind); 6431 6432 // The cost of the non-predicated instruction. 6433 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 6434 6435 // The cost of insertelement and extractelement instructions needed for 6436 // scalarization. 6437 Cost += getScalarizationOverhead(I, VF); 6438 6439 // Scale the cost by the probability of executing the predicated blocks. 6440 // This assumes the predicated block for each vector lane is equally 6441 // likely. 6442 return Cost / getReciprocalPredBlockProb(); 6443 } 6444 LLVM_FALLTHROUGH; 6445 case Instruction::Add: 6446 case Instruction::FAdd: 6447 case Instruction::Sub: 6448 case Instruction::FSub: 6449 case Instruction::Mul: 6450 case Instruction::FMul: 6451 case Instruction::FDiv: 6452 case Instruction::FRem: 6453 case Instruction::Shl: 6454 case Instruction::LShr: 6455 case Instruction::AShr: 6456 case Instruction::And: 6457 case Instruction::Or: 6458 case Instruction::Xor: { 6459 // Since we will replace the stride by 1 the multiplication should go away. 6460 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6461 return 0; 6462 // Certain instructions can be cheaper to vectorize if they have a constant 6463 // second vector operand. One example of this are shifts on x86. 6464 Value *Op2 = I->getOperand(1); 6465 TargetTransformInfo::OperandValueProperties Op2VP; 6466 TargetTransformInfo::OperandValueKind Op2VK = 6467 TTI.getOperandInfo(Op2, Op2VP); 6468 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6469 Op2VK = TargetTransformInfo::OK_UniformValue; 6470 6471 SmallVector<const Value *, 4> Operands(I->operand_values()); 6472 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6473 return N * TTI.getArithmeticInstrCost( 6474 I->getOpcode(), VectorTy, CostKind, 6475 TargetTransformInfo::OK_AnyValue, 6476 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6477 } 6478 case Instruction::FNeg: { 6479 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6480 return N * TTI.getArithmeticInstrCost( 6481 I->getOpcode(), VectorTy, CostKind, 6482 TargetTransformInfo::OK_AnyValue, 6483 TargetTransformInfo::OK_AnyValue, 6484 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6485 I->getOperand(0), I); 6486 } 6487 case Instruction::Select: { 6488 SelectInst *SI = cast<SelectInst>(I); 6489 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6490 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6491 Type *CondTy = SI->getCondition()->getType(); 6492 if (!ScalarCond) 6493 CondTy = FixedVectorType::get(CondTy, VF); 6494 6495 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 6496 CostKind, I); 6497 } 6498 case Instruction::ICmp: 6499 case Instruction::FCmp: { 6500 Type *ValTy = I->getOperand(0)->getType(); 6501 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6502 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6503 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6504 VectorTy = ToVectorTy(ValTy, VF); 6505 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind, 6506 I); 6507 } 6508 case Instruction::Store: 6509 case Instruction::Load: { 6510 unsigned Width = VF; 6511 if (Width > 1) { 6512 InstWidening Decision = getWideningDecision(I, Width); 6513 assert(Decision != CM_Unknown && 6514 "CM decision should be taken at this point"); 6515 if (Decision == CM_Scalarize) 6516 Width = 1; 6517 } 6518 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6519 return getMemoryInstructionCost(I, VF); 6520 } 6521 case Instruction::ZExt: 6522 case Instruction::SExt: 6523 case Instruction::FPToUI: 6524 case Instruction::FPToSI: 6525 case Instruction::FPExt: 6526 case Instruction::PtrToInt: 6527 case Instruction::IntToPtr: 6528 case Instruction::SIToFP: 6529 case Instruction::UIToFP: 6530 case Instruction::Trunc: 6531 case Instruction::FPTrunc: 6532 case Instruction::BitCast: { 6533 // Computes the CastContextHint from a Load/Store instruction. 6534 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 6535 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 6536 "Expected a load or a store!"); 6537 6538 if (VF == 1 || !TheLoop->contains(I)) 6539 return TTI::CastContextHint::Normal; 6540 6541 switch (getWideningDecision(I, VF)) { 6542 case LoopVectorizationCostModel::CM_GatherScatter: 6543 return TTI::CastContextHint::GatherScatter; 6544 case LoopVectorizationCostModel::CM_Interleave: 6545 return TTI::CastContextHint::Interleave; 6546 case LoopVectorizationCostModel::CM_Scalarize: 6547 case LoopVectorizationCostModel::CM_Widen: 6548 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 6549 : TTI::CastContextHint::Normal; 6550 case LoopVectorizationCostModel::CM_Widen_Reverse: 6551 return TTI::CastContextHint::Reversed; 6552 case LoopVectorizationCostModel::CM_Unknown: 6553 llvm_unreachable("Instr did not go through cost modelling?"); 6554 } 6555 6556 llvm_unreachable("Unhandled case!"); 6557 }; 6558 6559 unsigned Opcode = I->getOpcode(); 6560 TTI::CastContextHint CCH = TTI::CastContextHint::None; 6561 // For Trunc, the context is the only user, which must be a StoreInst. 6562 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 6563 if (I->hasOneUse()) 6564 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 6565 CCH = ComputeCCH(Store); 6566 } 6567 // For Z/Sext, the context is the operand, which must be a LoadInst. 
6568 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 6569 Opcode == Instruction::FPExt) { 6570 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 6571 CCH = ComputeCCH(Load); 6572 } 6573 6574 // We optimize the truncation of induction variables having constant 6575 // integer steps. The cost of these truncations is the same as the scalar 6576 // operation. 6577 if (isOptimizableIVTruncate(I, VF)) { 6578 auto *Trunc = cast<TruncInst>(I); 6579 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6580 Trunc->getSrcTy(), CCH, CostKind, Trunc); 6581 } 6582 6583 Type *SrcScalarTy = I->getOperand(0)->getType(); 6584 Type *SrcVecTy = 6585 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6586 if (canTruncateToMinimalBitwidth(I, VF)) { 6587 // This cast is going to be shrunk. This may remove the cast or it might 6588 // turn it into slightly different cast. For example, if MinBW == 16, 6589 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6590 // 6591 // Calculate the modified src and dest types. 6592 Type *MinVecTy = VectorTy; 6593 if (Opcode == Instruction::Trunc) { 6594 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6595 VectorTy = 6596 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6597 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 6598 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6599 VectorTy = 6600 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6601 } 6602 } 6603 6604 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6605 return N * 6606 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 6607 } 6608 case Instruction::Call: { 6609 bool NeedToScalarize; 6610 CallInst *CI = cast<CallInst>(I); 6611 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6612 if (getVectorIntrinsicIDForCall(CI, TLI)) 6613 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6614 return CallCost; 6615 } 6616 default: 6617 // The cost of executing VF copies of the scalar instruction. This opcode 6618 // is unknown. Assume that it is the same as 'mul'. 6619 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, 6620 CostKind) + 6621 getScalarizationOverhead(I, VF); 6622 } // end of switch. 
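  // For instance (hypothetical), an unknown opcode at VF = 4 falls into the
  // default case above and is charged four times the cost of a 'mul' on
  // VectorTy plus the insert/extract overhead from getScalarizationOverhead.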
6623 } 6624 6625 char LoopVectorize::ID = 0; 6626 6627 static const char lv_name[] = "Loop Vectorization"; 6628 6629 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6630 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6631 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6632 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6633 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6634 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6635 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6636 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6637 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6638 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6639 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6640 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6641 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6642 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6643 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6644 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6645 6646 namespace llvm { 6647 6648 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6649 6650 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6651 bool VectorizeOnlyWhenForced) { 6652 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6653 } 6654 6655 } // end namespace llvm 6656 6657 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6658 // Check if the pointer operand of a load or store instruction is 6659 // consecutive. 6660 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6661 return Legal->isConsecutivePtr(Ptr); 6662 return false; 6663 } 6664 6665 void LoopVectorizationCostModel::collectValuesToIgnore() { 6666 // Ignore ephemeral values. 6667 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6668 6669 // Ignore type-promoting instructions we identified during reduction 6670 // detection. 6671 for (auto &Reduction : Legal->getReductionVars()) { 6672 RecurrenceDescriptor &RedDes = Reduction.second; 6673 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6674 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6675 } 6676 // Ignore type-casting instructions we identified during induction 6677 // detection. 6678 for (auto &Induction : Legal->getInductionVars()) { 6679 InductionDescriptor &IndDes = Induction.second; 6680 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6681 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6682 } 6683 } 6684 6685 void LoopVectorizationCostModel::collectInLoopReductions() { 6686 // For the moment, without predicated reduction instructions, we do not 6687 // support inloop reductions whilst folding the tail, and hence in those cases 6688 // all reductions are currently out of the loop. 6689 if (!PreferInLoopReductions || foldTailByMasking()) 6690 return; 6691 6692 for (auto &Reduction : Legal->getReductionVars()) { 6693 PHINode *Phi = Reduction.first; 6694 RecurrenceDescriptor &RdxDesc = Reduction.second; 6695 6696 // We don't collect reductions that are type promoted (yet). 6697 if (RdxDesc.getRecurrenceType() != Phi->getType()) 6698 continue; 6699 6700 // Check that we can correctly put the reductions into the loop, by 6701 // finding the chain of operations that leads from the phi to the loop 6702 // exit value. 
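    // Illustrative IR sketch (names hypothetical):
    //   %sum = phi i32 [ 0, %preheader ], [ %sum.next, %loop ]
    //   %sum.next = add i32 %sum, %val
    // Here the chain from the phi to the loop-exit value is just {%sum.next},
    // so the reduction can be performed in-loop.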
    SmallVector<Instruction *, 4> ReductionOperations =
        RdxDesc.getReductionOpChain(Phi, TheLoop);
    bool InLoop = !ReductionOperations.empty();
    if (InLoop)
      InLoopReductionChains[Phi] = ReductionOperations;
    LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
                      << " reduction for phi: " << *Phi << "\n");
  }
}

// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                                 LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
  return WidestVectorRegBits / WidestType;
}

VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
  unsigned VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build VPlan
  // upfront in the vectorization pipeline.
  if (!OrigLoop->empty()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (!UserVF) {
      VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && VF < 2) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = 4;
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
                      << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF,
                                                             unsigned UserIC) {
  assert(OrigLoop->empty() && "Inner loop expected.");
  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
  if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
    return None;

  // Invalidate interleave groups if all blocks of the loop will be predicated.
  if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(*TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all
      // decisions based on them, which includes widening decisions and uniform
      // and scalar values.
6783 CM.invalidateCostModelingDecisions(); 6784 } 6785 6786 if (UserVF) { 6787 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 6788 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); 6789 // Collect the instructions (and their associated costs) that will be more 6790 // profitable to scalarize. 6791 CM.selectUserVectorizationFactor(UserVF); 6792 CM.collectInLoopReductions(); 6793 buildVPlansWithVPRecipes(UserVF, UserVF); 6794 LLVM_DEBUG(printPlans(dbgs())); 6795 return {{UserVF, 0}}; 6796 } 6797 6798 unsigned MaxVF = MaybeMaxVF.getValue(); 6799 assert(MaxVF != 0 && "MaxVF is zero."); 6800 6801 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 6802 // Collect Uniform and Scalar instructions after vectorization with VF. 6803 CM.collectUniformsAndScalars(VF); 6804 6805 // Collect the instructions (and their associated costs) that will be more 6806 // profitable to scalarize. 6807 if (VF > 1) 6808 CM.collectInstsToScalarize(VF); 6809 } 6810 6811 CM.collectInLoopReductions(); 6812 6813 buildVPlansWithVPRecipes(1, MaxVF); 6814 LLVM_DEBUG(printPlans(dbgs())); 6815 if (MaxVF == 1) 6816 return VectorizationFactor::Disabled(); 6817 6818 // Select the optimal vectorization factor. 6819 return CM.selectVectorizationFactor(MaxVF); 6820 } 6821 6822 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { 6823 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 6824 << '\n'); 6825 BestVF = VF; 6826 BestUF = UF; 6827 6828 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 6829 return !Plan->hasVF(VF); 6830 }); 6831 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 6832 } 6833 6834 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 6835 DominatorTree *DT) { 6836 // Perform the actual loop transformation. 6837 6838 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 6839 VPCallbackILV CallbackILV(ILV); 6840 6841 VPTransformState State{BestVF, BestUF, LI, 6842 DT, ILV.Builder, ILV.VectorLoopValueMap, 6843 &ILV, CallbackILV}; 6844 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 6845 State.TripCount = ILV.getOrCreateTripCount(nullptr); 6846 State.CanonicalIV = ILV.Induction; 6847 6848 //===------------------------------------------------===// 6849 // 6850 // Notice: any optimization or new instruction that go 6851 // into the code below should also be implemented in 6852 // the cost-model. 6853 // 6854 //===------------------------------------------------===// 6855 6856 // 2. Copy and widen instructions from the old loop into the new loop. 6857 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 6858 VPlans.front()->execute(&State); 6859 6860 // 3. Fix the vectorized code: take care of header phi's, live-outs, 6861 // predication, updating analyses. 6862 ILV.fixVectorizedLoop(); 6863 } 6864 6865 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 6866 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6867 BasicBlock *Latch = OrigLoop->getLoopLatch(); 6868 6869 // We create new control-flow for the vectorized loop, so the original 6870 // condition will be dead after vectorization if it's only used by the 6871 // branch. 6872 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 6873 if (Cmp && Cmp->hasOneUse()) 6874 DeadInstructions.insert(Cmp); 6875 6876 // We create new "steps" for induction variable updates to which the original 6877 // induction variables map. 
An original update instruction will be dead if 6878 // all its users except the induction variable are dead. 6879 for (auto &Induction : Legal->getInductionVars()) { 6880 PHINode *Ind = Induction.first; 6881 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 6882 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 6883 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 6884 })) 6885 DeadInstructions.insert(IndUpdate); 6886 6887 // We record as "Dead" also the type-casting instructions we had identified 6888 // during induction analysis. We don't need any handling for them in the 6889 // vectorized loop because we have proven that, under a proper runtime 6890 // test guarding the vectorized loop, the value of the phi, and the casted 6891 // value of the phi, are the same. The last instruction in this casting chain 6892 // will get its scalar/vector/widened def from the scalar/vector/widened def 6893 // of the respective phi node. Any other casts in the induction def-use chain 6894 // have no other uses outside the phi update chain, and will be ignored. 6895 InductionDescriptor &IndDes = Induction.second; 6896 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6897 DeadInstructions.insert(Casts.begin(), Casts.end()); 6898 } 6899 } 6900 6901 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6902 6903 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6904 6905 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6906 Instruction::BinaryOps BinOp) { 6907 // When unrolling and the VF is 1, we only need to add a simple scalar. 6908 Type *Ty = Val->getType(); 6909 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6910 6911 if (Ty->isFloatingPointTy()) { 6912 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6913 6914 // Floating point operations had to be 'fast' to enable the unrolling. 6915 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6916 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6917 } 6918 Constant *C = ConstantInt::get(Ty, StartIdx); 6919 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6920 } 6921 6922 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6923 SmallVector<Metadata *, 4> MDs; 6924 // Reserve first location for self reference to the LoopID metadata node. 6925 MDs.push_back(nullptr); 6926 bool IsUnrollMetadata = false; 6927 MDNode *LoopID = L->getLoopID(); 6928 if (LoopID) { 6929 // First find existing loop unrolling disable metadata. 6930 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6931 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6932 if (MD) { 6933 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6934 IsUnrollMetadata = 6935 S && S->getString().startswith("llvm.loop.unroll.disable"); 6936 } 6937 MDs.push_back(LoopID->getOperand(i)); 6938 } 6939 } 6940 6941 if (!IsUnrollMetadata) { 6942 // Add runtime unroll disable metadata. 6943 LLVMContext &Context = L->getHeader()->getContext(); 6944 SmallVector<Metadata *, 1> DisableOperands; 6945 DisableOperands.push_back( 6946 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6947 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6948 MDs.push_back(DisableNode); 6949 MDNode *NewLoopID = MDNode::get(Context, MDs); 6950 // Set operand 0 to refer to the loop id itself. 
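    // Illustrative sketch of the end result (assuming the loop carried no loop
    // metadata beforehand; the value and metadata names are made up for the
    // example): the remainder loop's latch branch ends up with a
    // self-referential loop ID along the lines of
    //
    //   br i1 %exitcond, label %for.body, label %for.end, !llvm.loop !0
    //   !0 = distinct !{!0, !1}
    //   !1 = !{!"llvm.loop.unroll.runtime.disable"}
    //
    // which later unrolling passes recognise and honour.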
6951 NewLoopID->replaceOperandWith(0, NewLoopID); 6952 L->setLoopID(NewLoopID); 6953 } 6954 } 6955 6956 bool LoopVectorizationPlanner::getDecisionAndClampRange( 6957 const std::function<bool(unsigned)> &Predicate, VFRange &Range) { 6958 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 6959 bool PredicateAtRangeStart = Predicate(Range.Start); 6960 6961 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 6962 if (Predicate(TmpVF) != PredicateAtRangeStart) { 6963 Range.End = TmpVF; 6964 break; 6965 } 6966 6967 return PredicateAtRangeStart; 6968 } 6969 6970 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 6971 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 6972 /// of VF's starting at a given VF and extending it as much as possible. Each 6973 /// vectorization decision can potentially shorten this sub-range during 6974 /// buildVPlan(). 6975 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 6976 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6977 VFRange SubRange = {VF, MaxVF + 1}; 6978 VPlans.push_back(buildVPlan(SubRange)); 6979 VF = SubRange.End; 6980 } 6981 } 6982 6983 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 6984 VPlanPtr &Plan) { 6985 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 6986 6987 // Look for cached value. 6988 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 6989 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 6990 if (ECEntryIt != EdgeMaskCache.end()) 6991 return ECEntryIt->second; 6992 6993 VPValue *SrcMask = createBlockInMask(Src, Plan); 6994 6995 // The terminator has to be a branch inst! 6996 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 6997 assert(BI && "Unexpected terminator found"); 6998 6999 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 7000 return EdgeMaskCache[Edge] = SrcMask; 7001 7002 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 7003 assert(EdgeMask && "No Edge Mask found for condition"); 7004 7005 if (BI->getSuccessor(0) != Dst) 7006 EdgeMask = Builder.createNot(EdgeMask); 7007 7008 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 7009 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 7010 7011 return EdgeMaskCache[Edge] = EdgeMask; 7012 } 7013 7014 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 7015 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 7016 7017 // Look for cached value. 7018 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 7019 if (BCEntryIt != BlockMaskCache.end()) 7020 return BCEntryIt->second; 7021 7022 // All-one mask is modelled as no-mask following the convention for masked 7023 // load/store/gather/scatter. Initialize BlockMask to no-mask. 7024 VPValue *BlockMask = nullptr; 7025 7026 if (OrigLoop->getHeader() == BB) { 7027 if (!CM.blockNeedsPredication(BB)) 7028 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 7029 7030 // Introduce the early-exit compare IV <= BTC to form header block mask. 7031 // This is used instead of IV < TC because TC may wrap, unlike BTC. 7032 // Start by constructing the desired canonical IV. 
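    // Illustrative sketch of the resulting header mask, assuming VF=4, a
    // 64-bit IV and made-up value names: the widened canonical IV holds
    // <i, i+1, i+2, i+3>, and the ICmpULE form below comes out roughly as
    //
    //   %btc.splat = shufflevector <4 x i64> %btc.ins, <4 x i64> undef,
    //                              <4 x i32> zeroinitializer
    //   %mask      = icmp ule <4 x i64> %vec.iv, %btc.splat
    //
    // whereas the ActiveLaneMask form instead emits a call to the
    // llvm.get.active.lane.mask intrinsic, which targets with hardware
    // predication (e.g. SVE, MVE) can lower to a single instruction.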
7033 VPValue *IV = nullptr; 7034 if (Legal->getPrimaryInduction()) 7035 IV = Plan->getVPValue(Legal->getPrimaryInduction()); 7036 else { 7037 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 7038 Builder.getInsertBlock()->appendRecipe(IVRecipe); 7039 IV = IVRecipe->getVPValue(); 7040 } 7041 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 7042 bool TailFolded = !CM.isScalarEpilogueAllowed(); 7043 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) 7044 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC}); 7045 else 7046 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 7047 return BlockMaskCache[BB] = BlockMask; 7048 } 7049 7050 // This is the block mask. We OR all incoming edges. 7051 for (auto *Predecessor : predecessors(BB)) { 7052 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 7053 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 7054 return BlockMaskCache[BB] = EdgeMask; 7055 7056 if (!BlockMask) { // BlockMask has its initialized nullptr value. 7057 BlockMask = EdgeMask; 7058 continue; 7059 } 7060 7061 BlockMask = Builder.createOr(BlockMask, EdgeMask); 7062 } 7063 7064 return BlockMaskCache[BB] = BlockMask; 7065 } 7066 7067 VPWidenMemoryInstructionRecipe * 7068 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 7069 VPlanPtr &Plan) { 7070 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7071 "Must be called with either a load or store"); 7072 7073 auto willWiden = [&](unsigned VF) -> bool { 7074 if (VF == 1) 7075 return false; 7076 LoopVectorizationCostModel::InstWidening Decision = 7077 CM.getWideningDecision(I, VF); 7078 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 7079 "CM decision should be taken at this point."); 7080 if (Decision == LoopVectorizationCostModel::CM_Interleave) 7081 return true; 7082 if (CM.isScalarAfterVectorization(I, VF) || 7083 CM.isProfitableToScalarize(I, VF)) 7084 return false; 7085 return Decision != LoopVectorizationCostModel::CM_Scalarize; 7086 }; 7087 7088 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7089 return nullptr; 7090 7091 VPValue *Mask = nullptr; 7092 if (Legal->isMaskRequired(I)) 7093 Mask = createBlockInMask(I->getParent(), Plan); 7094 7095 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 7096 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 7097 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 7098 7099 StoreInst *Store = cast<StoreInst>(I); 7100 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 7101 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 7102 } 7103 7104 VPWidenIntOrFpInductionRecipe * 7105 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { 7106 // Check if this is an integer or fp induction. If so, build the recipe that 7107 // produces its scalar and vector values. 7108 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 7109 if (II.getKind() == InductionDescriptor::IK_IntInduction || 7110 II.getKind() == InductionDescriptor::IK_FpInduction) 7111 return new VPWidenIntOrFpInductionRecipe(Phi); 7112 7113 return nullptr; 7114 } 7115 7116 VPWidenIntOrFpInductionRecipe * 7117 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, 7118 VFRange &Range) const { 7119 // Optimize the special case where the source is a constant integer 7120 // induction variable. 
Notice that we can only optimize the 'trunc' case 7121 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 7122 // (c) other casts depend on pointer size. 7123 7124 // Determine whether \p K is a truncation based on an induction variable that 7125 // can be optimized. 7126 auto isOptimizableIVTruncate = 7127 [&](Instruction *K) -> std::function<bool(unsigned)> { 7128 return 7129 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; 7130 }; 7131 7132 if (LoopVectorizationPlanner::getDecisionAndClampRange( 7133 isOptimizableIVTruncate(I), Range)) 7134 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 7135 I); 7136 return nullptr; 7137 } 7138 7139 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 7140 // We know that all PHIs in non-header blocks are converted into selects, so 7141 // we don't have to worry about the insertion order and we can just use the 7142 // builder. At this point we generate the predication tree. There may be 7143 // duplications since this is a simple recursive scan, but future 7144 // optimizations will clean it up. 7145 7146 SmallVector<VPValue *, 2> Operands; 7147 unsigned NumIncoming = Phi->getNumIncomingValues(); 7148 for (unsigned In = 0; In < NumIncoming; In++) { 7149 VPValue *EdgeMask = 7150 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 7151 assert((EdgeMask || NumIncoming == 1) && 7152 "Multiple predecessors with one having a full mask"); 7153 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 7154 if (EdgeMask) 7155 Operands.push_back(EdgeMask); 7156 } 7157 return new VPBlendRecipe(Phi, Operands); 7158 } 7159 7160 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 7161 VPlan &Plan) const { 7162 7163 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7164 [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); }, 7165 Range); 7166 7167 if (IsPredicated) 7168 return nullptr; 7169 7170 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7171 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 7172 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 7173 return nullptr; 7174 7175 auto willWiden = [&](unsigned VF) -> bool { 7176 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7177 // The following case may be scalarized depending on the VF. 7178 // The flag shows whether we use Intrinsic or a usual Call for vectorized 7179 // version of the instruction. 7180 // Is it beneficial to perform intrinsic call compared to lib call? 7181 bool NeedToScalarize = false; 7182 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 7183 bool UseVectorIntrinsic = 7184 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 7185 return UseVectorIntrinsic || !NeedToScalarize; 7186 }; 7187 7188 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7189 return nullptr; 7190 7191 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 7192 } 7193 7194 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 7195 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 7196 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 7197 // Instruction should be widened, unless it is scalar after vectorization, 7198 // scalarization is profitable or it is predicated. 
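  // For example (illustrative only): if the current Range is [2, 9) and the
  // cost model scalarizes I for VF=2 and VF=4 but would widen it for VF=8,
  // getDecisionAndClampRange clamps Range.End to 8, this VPlan treats I as
  // scalarized for the remaining [2, 8) sub-range, and the widening decision
  // is revisited when a separate VPlan is built for the range starting at 8.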
7199 auto WillScalarize = [this, I](unsigned VF) -> bool { 7200 return CM.isScalarAfterVectorization(I, VF) || 7201 CM.isProfitableToScalarize(I, VF) || 7202 CM.isScalarWithPredication(I, VF); 7203 }; 7204 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 7205 Range); 7206 } 7207 7208 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 7209 auto IsVectorizableOpcode = [](unsigned Opcode) { 7210 switch (Opcode) { 7211 case Instruction::Add: 7212 case Instruction::And: 7213 case Instruction::AShr: 7214 case Instruction::BitCast: 7215 case Instruction::FAdd: 7216 case Instruction::FCmp: 7217 case Instruction::FDiv: 7218 case Instruction::FMul: 7219 case Instruction::FNeg: 7220 case Instruction::FPExt: 7221 case Instruction::FPToSI: 7222 case Instruction::FPToUI: 7223 case Instruction::FPTrunc: 7224 case Instruction::FRem: 7225 case Instruction::FSub: 7226 case Instruction::ICmp: 7227 case Instruction::IntToPtr: 7228 case Instruction::LShr: 7229 case Instruction::Mul: 7230 case Instruction::Or: 7231 case Instruction::PtrToInt: 7232 case Instruction::SDiv: 7233 case Instruction::Select: 7234 case Instruction::SExt: 7235 case Instruction::Shl: 7236 case Instruction::SIToFP: 7237 case Instruction::SRem: 7238 case Instruction::Sub: 7239 case Instruction::Trunc: 7240 case Instruction::UDiv: 7241 case Instruction::UIToFP: 7242 case Instruction::URem: 7243 case Instruction::Xor: 7244 case Instruction::ZExt: 7245 return true; 7246 } 7247 return false; 7248 }; 7249 7250 if (!IsVectorizableOpcode(I->getOpcode())) 7251 return nullptr; 7252 7253 // Success: widen this instruction. 7254 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 7255 } 7256 7257 VPBasicBlock *VPRecipeBuilder::handleReplication( 7258 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 7259 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 7260 VPlanPtr &Plan) { 7261 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 7262 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, 7263 Range); 7264 7265 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7266 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 7267 7268 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 7269 IsUniform, IsPredicated); 7270 setRecipe(I, Recipe); 7271 7272 // Find if I uses a predicated instruction. If so, it will use its scalar 7273 // value. Avoid hoisting the insert-element which packs the scalar value into 7274 // a vector value, as that happens iff all users use the vector value. 7275 for (auto &Op : I->operands()) 7276 if (auto *PredInst = dyn_cast<Instruction>(Op)) 7277 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 7278 PredInst2Recipe[PredInst]->setAlsoPack(false); 7279 7280 // Finalize the recipe for Instr, first if it is not predicated. 7281 if (!IsPredicated) { 7282 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 7283 VPBB->appendRecipe(Recipe); 7284 return VPBB; 7285 } 7286 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7287 assert(VPBB->getSuccessors().empty() && 7288 "VPBB has successors when handling predicated replication."); 7289 // Record predicated instructions for above packing optimizations. 
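  // Illustrative sketch of the single-triangle region built below by
  // createReplicateRegion for a predicated store (block names follow the
  // "pred.<opcode>" scheme used there):
  //
  //   pred.store.entry:    BRANCH-ON-MASK of the block-in mask
  //        |      \
  //        |   pred.store.if:      REPLICATE store (one scalar instance)
  //        |      /
  //   pred.store.continue: PHI reassembling the predicated value, if the
  //                        replicated instruction produces one.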
7290 PredInst2Recipe[I] = Recipe; 7291 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7292 VPBlockUtils::insertBlockAfter(Region, VPBB); 7293 auto *RegSucc = new VPBasicBlock(); 7294 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7295 return RegSucc; 7296 } 7297 7298 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7299 VPRecipeBase *PredRecipe, 7300 VPlanPtr &Plan) { 7301 // Instructions marked for predication are replicated and placed under an 7302 // if-then construct to prevent side-effects. 7303 7304 // Generate recipes to compute the block mask for this region. 7305 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 7306 7307 // Build the triangular if-then region. 7308 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 7309 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 7310 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 7311 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 7312 auto *PHIRecipe = 7313 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 7314 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 7315 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 7316 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 7317 7318 // Note: first set Entry as region entry and then connect successors starting 7319 // from it in order, to propagate the "parent" of each VPBasicBlock. 7320 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 7321 VPBlockUtils::connectBlocks(Pred, Exit); 7322 7323 return Region; 7324 } 7325 7326 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 7327 VFRange &Range, 7328 VPlanPtr &Plan) { 7329 // First, check for specific widening recipes that deal with calls, memory 7330 // operations, inductions and Phi nodes. 7331 if (auto *CI = dyn_cast<CallInst>(Instr)) 7332 return tryToWidenCall(CI, Range, *Plan); 7333 7334 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 7335 return tryToWidenMemory(Instr, Range, Plan); 7336 7337 VPRecipeBase *Recipe; 7338 if (auto Phi = dyn_cast<PHINode>(Instr)) { 7339 if (Phi->getParent() != OrigLoop->getHeader()) 7340 return tryToBlend(Phi, Plan); 7341 if ((Recipe = tryToOptimizeInductionPHI(Phi))) 7342 return Recipe; 7343 return new VPWidenPHIRecipe(Phi); 7344 } 7345 7346 if (isa<TruncInst>(Instr) && 7347 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range))) 7348 return Recipe; 7349 7350 if (!shouldWiden(Instr, Range)) 7351 return nullptr; 7352 7353 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 7354 return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()), 7355 OrigLoop); 7356 7357 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 7358 bool InvariantCond = 7359 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 7360 return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()), 7361 InvariantCond); 7362 } 7363 7364 return tryToWiden(Instr, *Plan); 7365 } 7366 7367 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 7368 unsigned MaxVF) { 7369 assert(OrigLoop->empty() && "Inner loop expected."); 7370 7371 // Collect conditions feeding internal conditional branches; they need to be 7372 // represented in VPlan for it to model masking. 
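  // As an illustrative example, in a source loop such as
  //
  //   for (int i = 0; i < n; ++i)
  //     if (a[i] > 0)   // the compare feeds an internal conditional branch
  //       b[i] = a[i];
  //
  // the compare's value needs a def in the VPlan so that the edge and block
  // masks derived from it can be modelled.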
7373 SmallPtrSet<Value *, 1> NeedDef; 7374 7375 auto *Latch = OrigLoop->getLoopLatch(); 7376 for (BasicBlock *BB : OrigLoop->blocks()) { 7377 if (BB == Latch) 7378 continue; 7379 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); 7380 if (Branch && Branch->isConditional()) 7381 NeedDef.insert(Branch->getCondition()); 7382 } 7383 7384 // If the tail is to be folded by masking, the primary induction variable, if 7385 // exists needs to be represented in VPlan for it to model early-exit masking. 7386 // Also, both the Phi and the live-out instruction of each reduction are 7387 // required in order to introduce a select between them in VPlan. 7388 if (CM.foldTailByMasking()) { 7389 if (Legal->getPrimaryInduction()) 7390 NeedDef.insert(Legal->getPrimaryInduction()); 7391 for (auto &Reduction : Legal->getReductionVars()) { 7392 NeedDef.insert(Reduction.first); 7393 NeedDef.insert(Reduction.second.getLoopExitInstr()); 7394 } 7395 } 7396 7397 // Collect instructions from the original loop that will become trivially dead 7398 // in the vectorized loop. We don't need to vectorize these instructions. For 7399 // example, original induction update instructions can become dead because we 7400 // separately emit induction "steps" when generating code for the new loop. 7401 // Similarly, we create a new latch condition when setting up the structure 7402 // of the new loop, so the old one can become dead. 7403 SmallPtrSet<Instruction *, 4> DeadInstructions; 7404 collectTriviallyDeadInstructions(DeadInstructions); 7405 7406 // Add assume instructions we need to drop to DeadInstructions, to prevent 7407 // them from being added to the VPlan. 7408 // TODO: We only need to drop assumes in blocks that get flattend. If the 7409 // control flow is preserved, we should keep them. 7410 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 7411 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 7412 7413 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 7414 // Dead instructions do not need sinking. Remove them from SinkAfter. 7415 for (Instruction *I : DeadInstructions) 7416 SinkAfter.erase(I); 7417 7418 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 7419 VFRange SubRange = {VF, MaxVF + 1}; 7420 VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef, 7421 DeadInstructions, SinkAfter)); 7422 VF = SubRange.End; 7423 } 7424 } 7425 7426 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 7427 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, 7428 SmallPtrSetImpl<Instruction *> &DeadInstructions, 7429 const DenseMap<Instruction *, Instruction *> &SinkAfter) { 7430 7431 // Hold a mapping from predicated instructions to their recipes, in order to 7432 // fix their AlsoPack behavior if a user is determined to replicate and use a 7433 // scalar instead of vector value. 7434 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; 7435 7436 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 7437 7438 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 7439 7440 // --------------------------------------------------------------------------- 7441 // Pre-construction: record ingredients whose recipes we'll need to further 7442 // process after constructing the initial VPlan. 7443 // --------------------------------------------------------------------------- 7444 7445 // Mark instructions we'll need to sink later and their targets as 7446 // ingredients whose recipe we'll need to record. 
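  // A typical SinkAfter entry (illustrative only) comes from a first-order
  // recurrence such as
  //
  //   for (int i = 0; i < n; ++i) {
  //     b[i] = prev * 2;  // uses the value carried from the last iteration
  //     prev = a[i];      // defines the value for the next iteration
  //   }
  //
  // where the multiply, a user of the recurrence phi, must be sunk after the
  // load of a[i] so that the shuffle combining the previous and current
  // vector values is available when the multiply is widened.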
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor::RecurrenceKind Kind =
        Legal->getReductionVars()[Phi].getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
          Kind == RecurrenceDescriptor::RK_FloatMinMax) {
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
      }
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto applyIG = [IG, this](unsigned VF) -> bool {
      return (VF >= 2 && // Query is illegal for VF == 1
              CM.getWideningDecision(IG->getInsertPos(), VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
    if (!getDecisionAndClampRange(applyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
    for (unsigned i = 0; i < IG->getFactor(); i++)
      if (Instruction *Member = IG->getMember(i))
        RecipeBuilder.recordRecipeOf(Member);
  }

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
  auto Plan = std::make_unique<VPlan>();
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  Plan->setEntry(VPBB);

  // Represent values that will have defs inside VPlan.
  for (Value *V : NeedDef)
    Plan->addVPValue(V);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
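      // As an illustrative example, a body computing a[i] = b[i] + 1 typically
      // ends up with a VPWidenIntOrFpInductionRecipe for the induction phi,
      // VPWidenMemoryInstructionRecipes for the load and store, a VPWidenRecipe
      // for the add, and no recipe at all for the latch branch, the latch
      // compare or the dead scalar IV update filtered out here.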
7522 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 7523 continue; 7524 7525 if (auto Recipe = 7526 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { 7527 RecipeBuilder.setRecipe(Instr, Recipe); 7528 VPBB->appendRecipe(Recipe); 7529 continue; 7530 } 7531 7532 // Otherwise, if all widening options failed, Instruction is to be 7533 // replicated. This may create a successor for VPBB. 7534 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7535 Instr, Range, VPBB, PredInst2Recipe, Plan); 7536 if (NextVPBB != VPBB) { 7537 VPBB = NextVPBB; 7538 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 7539 : ""); 7540 } 7541 } 7542 } 7543 7544 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7545 // may also be empty, such as the last one VPBB, reflecting original 7546 // basic-blocks with no recipes. 7547 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7548 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7549 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7550 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7551 delete PreEntry; 7552 7553 // --------------------------------------------------------------------------- 7554 // Transform initial VPlan: Apply previously taken decisions, in order, to 7555 // bring the VPlan to its final state. 7556 // --------------------------------------------------------------------------- 7557 7558 // Apply Sink-After legal constraints. 7559 for (auto &Entry : SinkAfter) { 7560 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7561 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7562 Sink->moveAfter(Target); 7563 } 7564 7565 // Interleave memory: for each Interleave Group we marked earlier as relevant 7566 // for this VPlan, replace the Recipes widening its memory instructions with a 7567 // single VPInterleaveRecipe at its insertion point. 7568 for (auto IG : InterleaveGroups) { 7569 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7570 RecipeBuilder.getRecipe(IG->getInsertPos())); 7571 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) 7572 ->insertBefore(Recipe); 7573 7574 for (unsigned i = 0; i < IG->getFactor(); ++i) 7575 if (Instruction *Member = IG->getMember(i)) { 7576 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7577 } 7578 } 7579 7580 // Adjust the recipes for any inloop reductions. 7581 if (Range.Start > 1) 7582 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 7583 7584 // Finally, if tail is folded by masking, introduce selects between the phi 7585 // and the live-out instruction of each reduction, at the end of the latch. 
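  // Illustrative sketch for a simple i32 sum reduction with VF=4 and made-up
  // value names: the select introduced here keeps the previous partial sum in
  // the lanes that the header mask disables, so the folded tail cannot corrupt
  // the result:
  //
  //   %red.next = add <4 x i32> %red.phi, %wide.load
  //   %red.sel  = select <4 x i1> %header.mask, <4 x i32> %red.next,
  //                                             <4 x i32> %red.phi
  //
  // and %red.sel, rather than %red.next, then feeds the reduction phi on the
  // back-edge and the final horizontal reduction after the loop.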
  if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
    Builder.setInsertPoint(VPBB);
    auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
    for (auto &Reduction : Legal->getReductionVars()) {
      assert(!CM.isInLoopReduction(Reduction.first) &&
             "Didn't expect inloop tail folded reduction yet!");
      VPValue *Phi = Plan->getVPValue(Reduction.first);
      VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
      Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
    }
  }

  std::string PlanName;
  raw_string_ostream RSO(PlanName);
  unsigned VF = Range.Start;
  Plan->addVF(VF);
  RSO << "Initial VPlan for VF={" << VF;
  for (VF *= 2; VF < Range.End; VF *= 2) {
    Plan->addVF(VF);
    RSO << "," << VF;
  }
  RSO << "},UF>=1";
  RSO.flush();
  Plan->setName(PlanName);

  return Plan;
}

VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->empty());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanTransforms::VPInstructionsToVPRecipes(
      OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
  return Plan;
}

// Adjust the recipes for any inloop reductions. The chain of instructions
// leading from the loop exit instr to the phi needs to be converted to
// reductions, with one operand being vector and the other being the scalar
// reduction chain.
void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
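    // As an illustrative example, for an in-loop integer sum
    //
    //   for (int i = 0; i < n; ++i)
    //     s += a[i];
    //
    // ReductionOperations holds just the add, which gets replaced below by a
    // VPReductionRecipe whose scalar chain operand starts at the phi. For an
    // smax reduction the recorded operation is the select of the icmp/select
    // pair: the select carries the chain and the paired icmp's recipe is
    // erased below.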
7662 Instruction *Chain = Phi; 7663 for (Instruction *R : ReductionOperations) { 7664 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 7665 RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind(); 7666 7667 VPValue *ChainOp = Plan->getVPValue(Chain); 7668 unsigned FirstOpId; 7669 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7670 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7671 assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSelectSC && 7672 "Expected to replace a VPWidenSelectSC"); 7673 FirstOpId = 1; 7674 } else { 7675 assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC && 7676 "Expected to replace a VPWidenSC"); 7677 FirstOpId = 0; 7678 } 7679 unsigned VecOpId = 7680 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 7681 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 7682 7683 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 7684 &RdxDesc, R, ChainOp, VecOp, Legal->hasFunNoNaNAttr(), TTI); 7685 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 7686 WidenRecipe->eraseFromParent(); 7687 7688 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7689 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7690 VPRecipeBase *CompareRecipe = 7691 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 7692 assert(CompareRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC && 7693 "Expected to replace a VPWidenSC"); 7694 CompareRecipe->eraseFromParent(); 7695 } 7696 Chain = R; 7697 } 7698 } 7699 } 7700 7701 Value* LoopVectorizationPlanner::VPCallbackILV:: 7702 getOrCreateVectorValues(Value *V, unsigned Part) { 7703 return ILV.getOrCreateVectorValue(V, Part); 7704 } 7705 7706 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 7707 Value *V, const VPIteration &Instance) { 7708 return ILV.getOrCreateScalarValue(V, Instance); 7709 } 7710 7711 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 7712 VPSlotTracker &SlotTracker) const { 7713 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7714 IG->getInsertPos()->printAsOperand(O, false); 7715 O << ", "; 7716 getAddr()->printAsOperand(O, SlotTracker); 7717 VPValue *Mask = getMask(); 7718 if (Mask) { 7719 O << ", "; 7720 Mask->printAsOperand(O, SlotTracker); 7721 } 7722 for (unsigned i = 0; i < IG->getFactor(); ++i) 7723 if (Instruction *I = IG->getMember(i)) 7724 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 7725 } 7726 7727 void VPWidenCallRecipe::execute(VPTransformState &State) { 7728 State.ILV->widenCallInstruction(Ingredient, User, State); 7729 } 7730 7731 void VPWidenSelectRecipe::execute(VPTransformState &State) { 7732 State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State); 7733 } 7734 7735 void VPWidenRecipe::execute(VPTransformState &State) { 7736 State.ILV->widenInstruction(Ingredient, User, State); 7737 } 7738 7739 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7740 State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant, 7741 IsIndexLoopInvariant, State); 7742 } 7743 7744 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7745 assert(!State.Instance && "Int or FP induction being replicated."); 7746 State.ILV->widenIntOrFpInduction(IV, Trunc); 7747 } 7748 7749 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7750 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7751 } 7752 7753 void VPBlendRecipe::execute(VPTransformState &State) { 7754 
State.ILV->setDebugLocFromInst(State.Builder, Phi); 7755 // We know that all PHIs in non-header blocks are converted into 7756 // selects, so we don't have to worry about the insertion order and we 7757 // can just use the builder. 7758 // At this point we generate the predication tree. There may be 7759 // duplications since this is a simple recursive scan, but future 7760 // optimizations will clean it up. 7761 7762 unsigned NumIncoming = getNumIncomingValues(); 7763 7764 // Generate a sequence of selects of the form: 7765 // SELECT(Mask3, In3, 7766 // SELECT(Mask2, In2, 7767 // SELECT(Mask1, In1, 7768 // In0))) 7769 // Note that Mask0 is never used: lanes for which no path reaches this phi and 7770 // are essentially undef are taken from In0. 7771 InnerLoopVectorizer::VectorParts Entry(State.UF); 7772 for (unsigned In = 0; In < NumIncoming; ++In) { 7773 for (unsigned Part = 0; Part < State.UF; ++Part) { 7774 // We might have single edge PHIs (blocks) - use an identity 7775 // 'select' for the first PHI operand. 7776 Value *In0 = State.get(getIncomingValue(In), Part); 7777 if (In == 0) 7778 Entry[Part] = In0; // Initialize with the first incoming value. 7779 else { 7780 // Select between the current value and the previous incoming edge 7781 // based on the incoming mask. 7782 Value *Cond = State.get(getMask(In), Part); 7783 Entry[Part] = 7784 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 7785 } 7786 } 7787 } 7788 for (unsigned Part = 0; Part < State.UF; ++Part) 7789 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 7790 } 7791 7792 void VPInterleaveRecipe::execute(VPTransformState &State) { 7793 assert(!State.Instance && "Interleave group being replicated."); 7794 State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask()); 7795 } 7796 7797 void VPReductionRecipe::execute(VPTransformState &State) { 7798 assert(!State.Instance && "Reduction being replicated."); 7799 for (unsigned Part = 0; Part < State.UF; ++Part) { 7800 unsigned Kind = RdxDesc->getRecurrenceKind(); 7801 Value *NewVecOp = State.get(VecOp, Part); 7802 Value *NewRed = 7803 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN); 7804 Value *PrevInChain = State.get(ChainOp, Part); 7805 Value *NextInChain; 7806 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7807 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7808 NextInChain = 7809 createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(), 7810 NewRed, PrevInChain); 7811 } else { 7812 NextInChain = State.Builder.CreateBinOp( 7813 (Instruction::BinaryOps)I->getOpcode(), NewRed, PrevInChain); 7814 } 7815 State.ValueMap.setVectorValue(I, Part, NextInChain); 7816 } 7817 } 7818 7819 void VPReplicateRecipe::execute(VPTransformState &State) { 7820 if (State.Instance) { // Generate a single instance. 7821 State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance, 7822 IsPredicated, State); 7823 // Insert scalar instance packing it into a vector. 7824 if (AlsoPack && State.VF > 1) { 7825 // If we're constructing lane 0, initialize to start from undef. 
7826 if (State.Instance->Lane == 0) { 7827 Value *Undef = UndefValue::get( 7828 FixedVectorType::get(Ingredient->getType(), State.VF)); 7829 State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); 7830 } 7831 State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance); 7832 } 7833 return; 7834 } 7835 7836 // Generate scalar instances for all VF lanes of all UF parts, unless the 7837 // instruction is uniform inwhich case generate only the first lane for each 7838 // of the UF parts. 7839 unsigned EndLane = IsUniform ? 1 : State.VF; 7840 for (unsigned Part = 0; Part < State.UF; ++Part) 7841 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 7842 State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane}, 7843 IsPredicated, State); 7844 } 7845 7846 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 7847 assert(State.Instance && "Branch on Mask works only on single instance."); 7848 7849 unsigned Part = State.Instance->Part; 7850 unsigned Lane = State.Instance->Lane; 7851 7852 Value *ConditionBit = nullptr; 7853 VPValue *BlockInMask = getMask(); 7854 if (BlockInMask) { 7855 ConditionBit = State.get(BlockInMask, Part); 7856 if (ConditionBit->getType()->isVectorTy()) 7857 ConditionBit = State.Builder.CreateExtractElement( 7858 ConditionBit, State.Builder.getInt32(Lane)); 7859 } else // Block in mask is all-one. 7860 ConditionBit = State.Builder.getTrue(); 7861 7862 // Replace the temporary unreachable terminator with a new conditional branch, 7863 // whose two destinations will be set later when they are created. 7864 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 7865 assert(isa<UnreachableInst>(CurrentTerminator) && 7866 "Expected to replace unreachable terminator with conditional branch."); 7867 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 7868 CondBr->setSuccessor(0, nullptr); 7869 ReplaceInstWithInst(CurrentTerminator, CondBr); 7870 } 7871 7872 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 7873 assert(State.Instance && "Predicated instruction PHI works per instance."); 7874 Instruction *ScalarPredInst = cast<Instruction>( 7875 State.ValueMap.getScalarValue(PredInst, *State.Instance)); 7876 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 7877 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 7878 assert(PredicatingBB && "Predicated block has no single predecessor."); 7879 7880 // By current pack/unpack logic we need to generate only a single phi node: if 7881 // a vector value for the predicated instruction exists at this point it means 7882 // the instruction has vector users only, and a phi for the vector value is 7883 // needed. In this case the recipe of the predicated instruction is marked to 7884 // also do that packing, thereby "hoisting" the insert-element sequence. 7885 // Otherwise, a phi node for the scalar value is needed. 7886 unsigned Part = State.Instance->Part; 7887 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 7888 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 7889 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 7890 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 7891 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 7892 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 7893 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 
7894 } else { 7895 Type *PredInstType = PredInst->getType(); 7896 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 7897 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 7898 Phi->addIncoming(ScalarPredInst, PredicatedBB); 7899 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 7900 } 7901 } 7902 7903 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 7904 VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr; 7905 State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue, 7906 getMask()); 7907 } 7908 7909 // Determine how to lower the scalar epilogue, which depends on 1) optimising 7910 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 7911 // predication, and 4) a TTI hook that analyses whether the loop is suitable 7912 // for predication. 7913 static ScalarEpilogueLowering getScalarEpilogueLowering( 7914 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 7915 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 7916 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 7917 LoopVectorizationLegality &LVL) { 7918 // 1) OptSize takes precedence over all other options, i.e. if this is set, 7919 // don't look at hints or options, and don't request a scalar epilogue. 7920 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 7921 // LoopAccessInfo (due to code dependency and not being able to reliably get 7922 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 7923 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 7924 // versioning when the vectorization is forced, unlike hasOptSize. So revert 7925 // back to the old way and vectorize with versioning when forced. See D81345.) 7926 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 7927 PGSOQueryType::IRPass) && 7928 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 7929 return CM_ScalarEpilogueNotAllowedOptSize; 7930 7931 bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && 7932 !PreferPredicateOverEpilog; 7933 7934 // 2) Next, if disabling predication is requested on the command line, honour 7935 // this and request a scalar epilogue. 7936 if (PredicateOptDisabled) 7937 return CM_ScalarEpilogueAllowed; 7938 7939 // 3) and 4) look if enabling predication is requested on the command line, 7940 // with a loop hint, or if the TTI hook indicates this is profitable, request 7941 // predication . 7942 if (PreferPredicateOverEpilog || 7943 Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || 7944 (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 7945 LVL.getLAI()) && 7946 Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) 7947 return CM_ScalarEpilogueNotNeededUsePredicate; 7948 7949 return CM_ScalarEpilogueAllowed; 7950 } 7951 7952 // Process the loop in the VPlan-native vectorization path. This path builds 7953 // VPlan upfront in the vectorization pipeline, which allows to apply 7954 // VPlan-to-VPlan transformations from the very beginning without modifying the 7955 // input LLVM IR. 
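// As a usage sketch (illustrative; the exact cl::opt spellings live elsewhere
// in the vectorizer and VPlan sources and are assumed here), this path is
// typically exercised with something like
//
//   opt -loop-vectorize -enable-vplan-native-path -S outer-loop.ll
//
// optionally adding -vplan-build-stress-test to stop after VPlan construction.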
7956 static bool processLoopInVPlanNativePath( 7957 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 7958 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 7959 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 7960 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 7961 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 7962 7963 if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) { 7964 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 7965 return false; 7966 } 7967 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 7968 Function *F = L->getHeader()->getParent(); 7969 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 7970 7971 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7972 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 7973 7974 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 7975 &Hints, IAI); 7976 // Use the planner for outer loop vectorization. 7977 // TODO: CM is not used at this point inside the planner. Turn CM into an 7978 // optional argument if we don't need it in the future. 7979 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 7980 7981 // Get user vectorization factor. 7982 const unsigned UserVF = Hints.getWidth(); 7983 7984 // Plan how to best vectorize, return the best VF and its cost. 7985 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 7986 7987 // If we are stress testing VPlan builds, do not attempt to generate vector 7988 // code. Masked vector code generation support will follow soon. 7989 // Also, do not attempt to vectorize if no vector code will be produced. 7990 if (VPlanBuildStressTest || EnableVPlanPredication || 7991 VectorizationFactor::Disabled() == VF) 7992 return false; 7993 7994 LVP.setBestPlan(VF.Width, 1); 7995 7996 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 7997 &CM, BFI, PSI); 7998 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 7999 << L->getHeader()->getParent()->getName() << "\"\n"); 8000 LVP.executePlan(LB, DT); 8001 8002 // Mark the loop as already vectorized to avoid vectorizing again. 8003 Hints.setAlreadyVectorized(); 8004 8005 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 8006 return true; 8007 } 8008 8009 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 8010 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 8011 !EnableLoopInterleaving), 8012 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 8013 !EnableLoopVectorization) {} 8014 8015 bool LoopVectorizePass::processLoop(Loop *L) { 8016 assert((EnableVPlanNativePath || L->empty()) && 8017 "VPlan-native path is not enabled. Only process inner loops."); 8018 8019 #ifndef NDEBUG 8020 const std::string DebugLocStr = getDebugLocString(L); 8021 #endif /* NDEBUG */ 8022 8023 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 8024 << L->getHeader()->getParent()->getName() << "\" from " 8025 << DebugLocStr << "\n"); 8026 8027 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 8028 8029 LLVM_DEBUG( 8030 dbgs() << "LV: Loop hints:" 8031 << " force=" 8032 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 8033 ? "disabled" 8034 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 8035 ? 
"enabled" 8036 : "?")) 8037 << " width=" << Hints.getWidth() 8038 << " unroll=" << Hints.getInterleave() << "\n"); 8039 8040 // Function containing loop 8041 Function *F = L->getHeader()->getParent(); 8042 8043 // Looking at the diagnostic output is the only way to determine if a loop 8044 // was vectorized (other than looking at the IR or machine code), so it 8045 // is important to generate an optimization remark for each loop. Most of 8046 // these messages are generated as OptimizationRemarkAnalysis. Remarks 8047 // generated as OptimizationRemark and OptimizationRemarkMissed are 8048 // less verbose reporting vectorized loops and unvectorized loops that may 8049 // benefit from vectorization, respectively. 8050 8051 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 8052 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 8053 return false; 8054 } 8055 8056 PredicatedScalarEvolution PSE(*SE, *L); 8057 8058 // Check if it is legal to vectorize the loop. 8059 LoopVectorizationRequirements Requirements(*ORE); 8060 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 8061 &Requirements, &Hints, DB, AC, BFI, PSI); 8062 if (!LVL.canVectorize(EnableVPlanNativePath)) { 8063 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 8064 Hints.emitRemarkWithHints(); 8065 return false; 8066 } 8067 8068 // Check the function attributes and profiles to find out if this function 8069 // should be optimized for size. 8070 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 8071 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 8072 8073 // Entrance to the VPlan-native vectorization path. Outer loops are processed 8074 // here. They may require CFG and instruction level transformations before 8075 // even evaluating whether vectorization is profitable. Since we cannot modify 8076 // the incoming IR, we need to build VPlan upfront in the vectorization 8077 // pipeline. 8078 if (!L->empty()) 8079 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 8080 ORE, BFI, PSI, Hints); 8081 8082 assert(L->empty() && "Inner loop expected."); 8083 8084 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 8085 // count by optimizing for size, to minimize overheads. 8086 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 8087 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 8088 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 8089 << "This loop is worth vectorizing only if no scalar " 8090 << "iteration overheads are incurred."); 8091 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 8092 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 8093 else { 8094 LLVM_DEBUG(dbgs() << "\n"); 8095 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 8096 } 8097 } 8098 8099 // Check the function attributes to see if implicit floats are allowed. 8100 // FIXME: This check doesn't seem possibly correct -- what if the loop is 8101 // an integer loop and the vector instructions selected are purely integer 8102 // vector instructions? 8103 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 8104 reportVectorizationFailure( 8105 "Can't vectorize when the NoImplicitFloat attribute is used", 8106 "loop not vectorized due to NoImplicitFloat attribute", 8107 "NoImplicitFloat", ORE, L); 8108 Hints.emitRemarkWithHints(); 8109 return false; 8110 } 8111 8112 // Check if the target supports potentially unsafe FP vectorization. 
8113 // FIXME: Add a check for the type of safety issue (denormal, signaling) 8114 // for the target we're vectorizing for, to make sure none of the 8115 // additional fp-math flags can help. 8116 if (Hints.isPotentiallyUnsafe() && 8117 TTI->isFPVectorizationPotentiallyUnsafe()) { 8118 reportVectorizationFailure( 8119 "Potentially unsafe FP op prevents vectorization", 8120 "loop not vectorized due to unsafe FP support.", 8121 "UnsafeFP", ORE, L); 8122 Hints.emitRemarkWithHints(); 8123 return false; 8124 } 8125 8126 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 8127 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 8128 8129 // If an override option has been passed in for interleaved accesses, use it. 8130 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 8131 UseInterleaved = EnableInterleavedMemAccesses; 8132 8133 // Analyze interleaved memory accesses. 8134 if (UseInterleaved) { 8135 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 8136 } 8137 8138 // Use the cost model. 8139 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 8140 F, &Hints, IAI); 8141 CM.collectValuesToIgnore(); 8142 8143 // Use the planner for vectorization. 8144 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); 8145 8146 // Get user vectorization factor and interleave count. 8147 unsigned UserVF = Hints.getWidth(); 8148 unsigned UserIC = Hints.getInterleave(); 8149 8150 // Plan how to best vectorize, return the best VF and its cost. 8151 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 8152 8153 VectorizationFactor VF = VectorizationFactor::Disabled(); 8154 unsigned IC = 1; 8155 8156 if (MaybeVF) { 8157 VF = *MaybeVF; 8158 // Select the interleave count. 8159 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 8160 } 8161 8162 // Identify the diagnostic messages that should be produced. 8163 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 8164 bool VectorizeLoop = true, InterleaveLoop = true; 8165 if (Requirements.doesNotMeet(F, L, Hints)) { 8166 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " 8167 "requirements.\n"); 8168 Hints.emitRemarkWithHints(); 8169 return false; 8170 } 8171 8172 if (VF.Width == 1) { 8173 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 8174 VecDiagMsg = std::make_pair( 8175 "VectorizationNotBeneficial", 8176 "the cost-model indicates that vectorization is not beneficial"); 8177 VectorizeLoop = false; 8178 } 8179 8180 if (!MaybeVF && UserIC > 1) { 8181 // Tell the user interleaving was avoided up-front, despite being explicitly 8182 // requested. 8183 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 8184 "interleaving should be avoided up front\n"); 8185 IntDiagMsg = std::make_pair( 8186 "InterleavingAvoided", 8187 "Ignoring UserIC, because interleaving was avoided up front"); 8188 InterleaveLoop = false; 8189 } else if (IC == 1 && UserIC <= 1) { 8190 // Tell the user interleaving is not beneficial. 
8191 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 8192 IntDiagMsg = std::make_pair( 8193 "InterleavingNotBeneficial", 8194 "the cost-model indicates that interleaving is not beneficial"); 8195 InterleaveLoop = false; 8196 if (UserIC == 1) { 8197 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; 8198 IntDiagMsg.second += 8199 " and is explicitly disabled or interleave count is set to 1"; 8200 } 8201 } else if (IC > 1 && UserIC == 1) { 8202 // Tell the user interleaving is beneficial, but it explicitly disabled. 8203 LLVM_DEBUG( 8204 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); 8205 IntDiagMsg = std::make_pair( 8206 "InterleavingBeneficialButDisabled", 8207 "the cost-model indicates that interleaving is beneficial " 8208 "but is explicitly disabled or interleave count is set to 1"); 8209 InterleaveLoop = false; 8210 } 8211 8212 // Override IC if user provided an interleave count. 8213 IC = UserIC > 0 ? UserIC : IC; 8214 8215 // Emit diagnostic messages, if any. 8216 const char *VAPassName = Hints.vectorizeAnalysisPassName(); 8217 if (!VectorizeLoop && !InterleaveLoop) { 8218 // Do not vectorize or interleaving the loop. 8219 ORE->emit([&]() { 8220 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 8221 L->getStartLoc(), L->getHeader()) 8222 << VecDiagMsg.second; 8223 }); 8224 ORE->emit([&]() { 8225 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 8226 L->getStartLoc(), L->getHeader()) 8227 << IntDiagMsg.second; 8228 }); 8229 return false; 8230 } else if (!VectorizeLoop && InterleaveLoop) { 8231 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 8232 ORE->emit([&]() { 8233 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 8234 L->getStartLoc(), L->getHeader()) 8235 << VecDiagMsg.second; 8236 }); 8237 } else if (VectorizeLoop && !InterleaveLoop) { 8238 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 8239 << ") in " << DebugLocStr << '\n'); 8240 ORE->emit([&]() { 8241 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 8242 L->getStartLoc(), L->getHeader()) 8243 << IntDiagMsg.second; 8244 }); 8245 } else if (VectorizeLoop && InterleaveLoop) { 8246 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 8247 << ") in " << DebugLocStr << '\n'); 8248 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 8249 } 8250 8251 LVP.setBestPlan(VF.Width, IC); 8252 8253 using namespace ore; 8254 bool DisableRuntimeUnroll = false; 8255 MDNode *OrigLoopID = L->getLoopID(); 8256 8257 if (!VectorizeLoop) { 8258 assert(IC > 1 && "interleave count should not be 1 or 0"); 8259 // If we decided that it is not legal to vectorize the loop, then 8260 // interleave it. 8261 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, 8262 BFI, PSI); 8263 LVP.executePlan(Unroller, DT); 8264 8265 ORE->emit([&]() { 8266 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 8267 L->getHeader()) 8268 << "interleaved loop (interleaved count: " 8269 << NV("InterleaveCount", IC) << ")"; 8270 }); 8271 } else { 8272 // If we decided that it is *legal* to vectorize the loop, then do it. 8273 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 8274 &LVL, &CM, BFI, PSI); 8275 LVP.executePlan(LB, DT); 8276 ++LoopsVectorized; 8277 8278 // Add metadata to disable runtime unrolling a scalar loop when there are 8279 // no runtime checks about strides and memory. A scalar loop that is 8280 // rarely used is not worth unrolling. 
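    // (Illustrative reasoning: without memory or stride runtime checks the
    // scalar loop only ever executes the tail, i.e. fewer than VF * IC
    // iterations, at most 7 for VF=4 and IC=2, so runtime unrolling it cannot
    // pay off. When runtime checks exist, the scalar loop may instead run the
    // full trip count whenever those checks fail, so unrolling stays enabled.)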

    // Add metadata to disable runtime unrolling a scalar loop when there are
    // no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt vectorization if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // All collected inner loops have been processed.
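  // Changed records any IR modification (including the loop simplification and
  // LCSSA formation above), while CFGChanged is only set by the transforms
  // that may modify the control-flow graph; run() uses the latter to decide
  // whether CFG analyses can be preserved.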
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loop-info/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for the VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}
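
// A minimal usage sketch, assuming the standard PassBuilder and analysis
// manager boilerplate has already been set up (F is a Function, FAM a
// FunctionAnalysisManager registered with the usual analyses):
//
//   FunctionPassManager FPM;
//   FPM.addPass(LoopVectorizePass());
//   FPM.run(F, FAM);
//
// The pass can also be exercised directly from the command line with the new
// pass manager, e.g. "opt -passes=loop-vectorize -S input.ll".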