//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
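//
// As an informal illustration (not part of the pass itself), the widening
// described above conceptually turns a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// into a wide loop that processes several elements per iteration plus a
// scalar epilogue for the remainder, shown here in C-like pseudocode for an
// assumed vectorization factor of 4 and an interleave factor of 1:
//
//   int i = 0;
//   for (; i + 4 <= n; i += 4)              // one 'wide' iteration
//     a[i..i+3] = b[i..i+3] + c[i..i+3];    // wide load / add / store
//   for (; i < n; ++i)                      // scalar epilogue for the tail
//     a[i] = b[i] + c[i];
//
// The actual transformation operates on LLVM-IR, and the vectorization and
// interleave factors are chosen by the cost model described above.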
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired and predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
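/// For example (an illustrative case, not taken from the code below): in a
/// loop that reads A[3*i] and A[3*i+2], the two loads form an interleave
/// group with factor 3 and a gap at member index 1; the gap lanes of the wide
/// load can be masked away rather than requiring a scalar epilogue.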
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses "
             "in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of the loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
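/// For example (illustrative, assuming a typical x86-64 data layout), x86_fp80
/// has an allocation size of 16 bytes but a store size of 10 bytes, so an
/// array of VF such elements is not layout-compatible with the corresponding
/// vector type, and the type is considered irregular.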
static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
  assert(!VF.Scalable && "scalable vectors not yet supported.");
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF.isVector()) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects.
/// The InnerLoopVectorizer relies on the LoopVectorizationLegality class to
/// provide information about the induction and reduction variables that were
/// found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and
  /// more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
                ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive.
  /// Uses the VPValue operands from \p Operands instead of \p Instr's
  /// operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                VPTransformState &State, VPValue *Addr,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr, VPValue *StoredValue,
                                  VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one.
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in
  /// the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
  /// otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - the original one (if \p EntryVal is a phi corresponding to
  /// the original IV) or the "newly-created" one based on the proof mentioned
  /// above (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()).
  /// In the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index,
                              ScalarEvolution *SE, const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  void createInductionResumeValues(Loop *L, Value *VectorTripCount);

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B,
                                              const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.Scalable && "scalable vectors not yet supported.");
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF.Min);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back
    // to using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdown due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way, and
  /// the form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
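  /// For instance (illustrative), the scalar address computation feeding a
  /// consecutive (unit-stride) wide load or store produces the same value for
  /// every lane of a given vector iteration, so it is uniform after
  /// vectorization.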
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W, unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
    assert(!VF.Scalable && "scalable vectors not yet supported.");
    assert(VF.isVector() && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I,
                               ElementCount VF = ElementCount::getFixed(1));

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the tail loop.
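  /// For example (illustrative): with a trip count of 10 and a vectorization
  /// factor of 4, folding the tail means executing three masked vector
  /// iterations, the last with only two active lanes, instead of two vector
  /// iterations plus a scalar remainder loop.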
1361 bool foldTailByMasking() const { return FoldTailByMasking; } 1362 1363 bool blockNeedsPredication(BasicBlock *BB) { 1364 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1365 } 1366 1367 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1368 /// nodes to the chain of instructions representing the reductions. Uses a 1369 /// MapVector to ensure deterministic iteration order. 1370 using ReductionChainMap = 1371 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1372 1373 /// Return the chain of instructions representing an inloop reduction. 1374 const ReductionChainMap &getInLoopReductionChains() const { 1375 return InLoopReductionChains; 1376 } 1377 1378 /// Returns true if the Phi is part of an inloop reduction. 1379 bool isInLoopReduction(PHINode *Phi) const { 1380 return InLoopReductionChains.count(Phi); 1381 } 1382 1383 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1384 /// with factor VF. Return the cost of the instruction, including 1385 /// scalarization overhead if it's needed. 1386 unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF); 1387 1388 /// Estimate cost of a call instruction CI if it were vectorized with factor 1389 /// VF. Return the cost of the instruction, including scalarization overhead 1390 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1391 /// scalarized - 1392 /// i.e. either vector version isn't available, or is too expensive. 1393 unsigned getVectorCallCost(CallInst *CI, ElementCount VF, 1394 bool &NeedToScalarize); 1395 1396 /// Invalidates decisions already taken by the cost model. 1397 void invalidateCostModelingDecisions() { 1398 WideningDecisions.clear(); 1399 Uniforms.clear(); 1400 Scalars.clear(); 1401 } 1402 1403 private: 1404 unsigned NumPredStores = 0; 1405 1406 /// \return An upper bound for the vectorization factor, a power-of-2 larger 1407 /// than zero. One is returned if vectorization should best be avoided due 1408 /// to cost. 1409 unsigned computeFeasibleMaxVF(unsigned ConstTripCount); 1410 1411 /// The vectorization cost is a combination of the cost itself and a boolean 1412 /// indicating whether any of the contributing operations will actually 1413 /// operate on 1414 /// vector values after type legalization in the backend. If this latter value 1415 /// is 1416 /// false, then all operations will be scalarized (i.e. no vectorization has 1417 /// actually taken place). 1418 using VectorizationCostTy = std::pair<unsigned, bool>; 1419 1420 /// Returns the expected execution cost. The unit of the cost does 1421 /// not matter because we use the 'cost' units to compare different 1422 /// vector widths. The cost that is returned is *not* normalized by 1423 /// the factor width. 1424 VectorizationCostTy expectedCost(ElementCount VF); 1425 1426 /// Returns the execution time cost of an instruction for a given vector 1427 /// width. Vector width of one means scalar. 1428 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1429 1430 /// The cost-computation logic from getInstructionCost which provides 1431 /// the vector type as an output parameter. 1432 unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy); 1433 1434 /// Calculate vectorization cost of memory instruction \p I. 1435 unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF); 1436 1437 /// The cost computation for scalarized memory instruction. 
1438   unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1439
1440   /// The cost computation for an interleaved group of memory instructions.
1441   unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1442
1443   /// The cost computation for a Gather/Scatter instruction.
1444   unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1445
1446   /// The cost computation for widening instruction \p I with consecutive
1447   /// memory access.
1448   unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1449
1450   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1451   /// Load: scalar load + broadcast.
1452   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1453   /// element)
1454   unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1455
1456   /// Estimate the overhead of scalarizing an instruction. This is a
1457   /// convenience wrapper for the type-based getScalarizationOverhead API.
1458   unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1459
1460   /// Returns whether the instruction is a load or store and will be emitted
1461   /// as a vector operation.
1462   bool isConsecutiveLoadOrStore(Instruction *I);
1463
1464   /// Returns true if an artificially high cost for emulated masked memrefs
1465   /// should be used.
1466   bool useEmulatedMaskMemRefHack(Instruction *I);
1467
1468   /// Map of scalar integer values to the smallest bitwidth they can be legally
1469   /// represented as. The vector equivalents of these values should be truncated
1470   /// to this type.
1471   MapVector<Instruction *, uint64_t> MinBWs;
1472
1473   /// A type representing the costs for instructions if they were to be
1474   /// scalarized rather than vectorized. The entries are Instruction-Cost
1475   /// pairs.
1476   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1477
1478   /// A set containing all BasicBlocks that are known to be predicated blocks
1479   /// after vectorization.
1480   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1481
1482   /// Records whether it is allowed to have the original scalar loop execute at
1483   /// least once. This may be needed as a fallback loop in case runtime
1484   /// aliasing/dependence checks fail, or to handle the tail/remainder
1485   /// iterations when the trip count is unknown or doesn't divide by the VF,
1486   /// or as a peel-loop to handle gaps in interleave-groups.
1487   /// Under optsize and when the trip count is very small we don't allow any
1488   /// iterations to execute in the scalar loop.
1489   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1490
1491   /// All blocks of the loop are to be masked to fold the tail of the scalar iterations.
1492   bool FoldTailByMasking = false;
1493
1494   /// A map holding scalar costs for different vectorization factors. The
1495   /// presence of a cost for an instruction in the mapping indicates that the
1496   /// instruction will be scalarized when vectorizing with the associated
1497   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1498   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1499
1500   /// Holds the instructions known to be uniform after vectorization.
1501   /// The data is collected per VF.
1502   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1503
1504   /// Holds the instructions known to be scalar after vectorization.
1505   /// The data is collected per VF.
1506 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1507 1508 /// Holds the instructions (address computations) that are forced to be 1509 /// scalarized. 1510 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1511 1512 /// PHINodes of the reductions that should be expanded in-loop along with 1513 /// their associated chains of reduction operations, in program order from top 1514 /// (PHI) to bottom 1515 ReductionChainMap InLoopReductionChains; 1516 1517 /// Returns the expected difference in cost from scalarizing the expression 1518 /// feeding a predicated instruction \p PredInst. The instructions to 1519 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1520 /// non-negative return value implies the expression will be scalarized. 1521 /// Currently, only single-use chains are considered for scalarization. 1522 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1523 ElementCount VF); 1524 1525 /// Collect the instructions that are uniform after vectorization. An 1526 /// instruction is uniform if we represent it with a single scalar value in 1527 /// the vectorized loop corresponding to each vector iteration. Examples of 1528 /// uniform instructions include pointer operands of consecutive or 1529 /// interleaved memory accesses. Note that although uniformity implies an 1530 /// instruction will be scalar, the reverse is not true. In general, a 1531 /// scalarized instruction will be represented by VF scalar values in the 1532 /// vectorized loop, each corresponding to an iteration of the original 1533 /// scalar loop. 1534 void collectLoopUniforms(ElementCount VF); 1535 1536 /// Collect the instructions that are scalar after vectorization. An 1537 /// instruction is scalar if it is known to be uniform or will be scalarized 1538 /// during vectorization. Non-uniform scalarized instructions will be 1539 /// represented by VF values in the vectorized loop, each corresponding to an 1540 /// iteration of the original scalar loop. 1541 void collectLoopScalars(ElementCount VF); 1542 1543 /// Keeps cost model vectorization decision and cost for instructions. 1544 /// Right now it is used for memory instructions only. 1545 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1546 std::pair<InstWidening, unsigned>>; 1547 1548 DecisionList WideningDecisions; 1549 1550 /// Returns true if \p V is expected to be vectorized and it needs to be 1551 /// extracted. 1552 bool needsExtract(Value *V, ElementCount VF) const { 1553 Instruction *I = dyn_cast<Instruction>(V); 1554 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1555 TheLoop->isLoopInvariant(I)) 1556 return false; 1557 1558 // Assume we can vectorize V (and hence we need extraction) if the 1559 // scalars are not computed yet. This can happen, because it is called 1560 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1561 // the scalars are collected. That should be a safe assumption in most 1562 // cases, because we check if the operands have vectorizable types 1563 // beforehand in LoopVectorizationLegality. 1564 return Scalars.find(VF) == Scalars.end() || 1565 !isScalarAfterVectorization(I, VF); 1566 }; 1567 1568 /// Returns a range containing only operands needing to be extracted. 
1569 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1570 ElementCount VF) { 1571 return SmallVector<Value *, 4>(make_filter_range( 1572 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1573 } 1574 1575 public: 1576 /// The loop that we evaluate. 1577 Loop *TheLoop; 1578 1579 /// Predicated scalar evolution analysis. 1580 PredicatedScalarEvolution &PSE; 1581 1582 /// Loop Info analysis. 1583 LoopInfo *LI; 1584 1585 /// Vectorization legality. 1586 LoopVectorizationLegality *Legal; 1587 1588 /// Vector target information. 1589 const TargetTransformInfo &TTI; 1590 1591 /// Target Library Info. 1592 const TargetLibraryInfo *TLI; 1593 1594 /// Demanded bits analysis. 1595 DemandedBits *DB; 1596 1597 /// Assumption cache. 1598 AssumptionCache *AC; 1599 1600 /// Interface to emit optimization remarks. 1601 OptimizationRemarkEmitter *ORE; 1602 1603 const Function *TheFunction; 1604 1605 /// Loop Vectorize Hint. 1606 const LoopVectorizeHints *Hints; 1607 1608 /// The interleave access information contains groups of interleaved accesses 1609 /// with the same stride and close to each other. 1610 InterleavedAccessInfo &InterleaveInfo; 1611 1612 /// Values to ignore in the cost model. 1613 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1614 1615 /// Values to ignore in the cost model when VF > 1. 1616 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1617 }; 1618 1619 } // end namespace llvm 1620 1621 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1622 // vectorization. The loop needs to be annotated with #pragma omp simd 1623 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1624 // vector length information is not provided, vectorization is not considered 1625 // explicit. Interleave hints are not allowed either. These limitations will be 1626 // relaxed in the future. 1627 // Please, note that we are currently forced to abuse the pragma 'clang 1628 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1629 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1630 // provides *explicit vectorization hints* (LV can bypass legal checks and 1631 // assume that vectorization is legal). However, both hints are implemented 1632 // using the same metadata (llvm.loop.vectorize, processed by 1633 // LoopVectorizeHints). This will be fixed in the future when the native IR 1634 // representation for pragma 'omp simd' is introduced. 1635 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1636 OptimizationRemarkEmitter *ORE) { 1637 assert(!OuterLp->empty() && "This is not an outer loop"); 1638 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1639 1640 // Only outer loops with an explicit vectorization hint are supported. 1641 // Unannotated outer loops are ignored. 1642 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1643 return false; 1644 1645 Function *Fn = OuterLp->getHeader()->getParent(); 1646 if (!Hints.allowVectorization(Fn, OuterLp, 1647 true /*VectorizeOnlyWhenForced*/)) { 1648 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1649 return false; 1650 } 1651 1652 if (Hints.getInterleave() > 1) { 1653 // TODO: Interleave support is future work. 
1654 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1655 "outer loops.\n"); 1656 Hints.emitRemarkWithHints(); 1657 return false; 1658 } 1659 1660 return true; 1661 } 1662 1663 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1664 OptimizationRemarkEmitter *ORE, 1665 SmallVectorImpl<Loop *> &V) { 1666 // Collect inner loops and outer loops without irreducible control flow. For 1667 // now, only collect outer loops that have explicit vectorization hints. If we 1668 // are stress testing the VPlan H-CFG construction, we collect the outermost 1669 // loop of every loop nest. 1670 if (L.empty() || VPlanBuildStressTest || 1671 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1672 LoopBlocksRPO RPOT(&L); 1673 RPOT.perform(LI); 1674 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1675 V.push_back(&L); 1676 // TODO: Collect inner loops inside marked outer loops in case 1677 // vectorization fails for the outer loop. Do not invoke 1678 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1679 // already known to be reducible. We can use an inherited attribute for 1680 // that. 1681 return; 1682 } 1683 } 1684 for (Loop *InnerL : L) 1685 collectSupportedLoops(*InnerL, LI, ORE, V); 1686 } 1687 1688 namespace { 1689 1690 /// The LoopVectorize Pass. 1691 struct LoopVectorize : public FunctionPass { 1692 /// Pass identification, replacement for typeid 1693 static char ID; 1694 1695 LoopVectorizePass Impl; 1696 1697 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1698 bool VectorizeOnlyWhenForced = false) 1699 : FunctionPass(ID), 1700 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1701 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1702 } 1703 1704 bool runOnFunction(Function &F) override { 1705 if (skipFunction(F)) 1706 return false; 1707 1708 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1709 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1710 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1711 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1712 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1713 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1714 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 1715 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1716 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1717 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1718 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1719 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1720 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1721 1722 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1723 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1724 1725 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1726 GetLAA, *ORE, PSI).MadeAnyChange; 1727 } 1728 1729 void getAnalysisUsage(AnalysisUsage &AU) const override { 1730 AU.addRequired<AssumptionCacheTracker>(); 1731 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1732 AU.addRequired<DominatorTreeWrapperPass>(); 1733 AU.addRequired<LoopInfoWrapperPass>(); 1734 AU.addRequired<ScalarEvolutionWrapperPass>(); 1735 AU.addRequired<TargetTransformInfoWrapperPass>(); 1736 AU.addRequired<AAResultsWrapperPass>(); 1737 AU.addRequired<LoopAccessLegacyAnalysis>(); 1738 AU.addRequired<DemandedBitsWrapperPass>(); 1739 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1740 AU.addRequired<InjectTLIMappingsLegacy>(); 1741 1742 // We currently do not preserve loopinfo/dominator analyses with outer loop 1743 // vectorization. Until this is addressed, mark these analyses as preserved 1744 // only for non-VPlan-native path. 1745 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1746 if (!EnableVPlanNativePath) { 1747 AU.addPreserved<LoopInfoWrapperPass>(); 1748 AU.addPreserved<DominatorTreeWrapperPass>(); 1749 } 1750 1751 AU.addPreserved<BasicAAWrapperPass>(); 1752 AU.addPreserved<GlobalsAAWrapperPass>(); 1753 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1754 } 1755 }; 1756 1757 } // end anonymous namespace 1758 1759 //===----------------------------------------------------------------------===// 1760 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1761 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1762 //===----------------------------------------------------------------------===// 1763 1764 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1765 // We need to place the broadcast of invariant variables outside the loop, 1766 // but only if it's proven safe to do so. Else, broadcast will be inside 1767 // vector loop body. 1768 Instruction *Instr = dyn_cast<Instruction>(V); 1769 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1770 (!Instr || 1771 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1772 // Place the code for broadcasting invariant variables in the new preheader. 1773 IRBuilder<>::InsertPointGuard Guard(Builder); 1774 if (SafeToHoist) 1775 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1776 1777 // Broadcast the scalar into all locations in the vector. 
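  // For example (an illustrative sketch, assuming a 32-bit element type and
  // VF = 4), the splat below is emitted as an insertelement into lane 0
  // followed by a shufflevector with an all-zero mask:
  //   %b.ins     = insertelement <4 x i32> undef, i32 %v, i32 0
  //   %broadcast = shufflevector <4 x i32> %b.ins, <4 x i32> undef,
  //                              <4 x i32> zeroinitializer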
1778 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1779 1780 return Shuf; 1781 } 1782 1783 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1784 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1785 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1786 "Expected either an induction phi-node or a truncate of it!"); 1787 Value *Start = II.getStartValue(); 1788 1789 // Construct the initial value of the vector IV in the vector loop preheader 1790 auto CurrIP = Builder.saveIP(); 1791 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1792 if (isa<TruncInst>(EntryVal)) { 1793 assert(Start->getType()->isIntegerTy() && 1794 "Truncation requires an integer type"); 1795 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1796 Step = Builder.CreateTrunc(Step, TruncType); 1797 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1798 } 1799 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1800 Value *SteppedStart = 1801 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1802 1803 // We create vector phi nodes for both integer and floating-point induction 1804 // variables. Here, we determine the kind of arithmetic we will perform. 1805 Instruction::BinaryOps AddOp; 1806 Instruction::BinaryOps MulOp; 1807 if (Step->getType()->isIntegerTy()) { 1808 AddOp = Instruction::Add; 1809 MulOp = Instruction::Mul; 1810 } else { 1811 AddOp = II.getInductionOpcode(); 1812 MulOp = Instruction::FMul; 1813 } 1814 1815 // Multiply the vectorization factor by the step using integer or 1816 // floating-point arithmetic as appropriate. 1817 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF.Min); 1818 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1819 1820 // Create a vector splat to use in the induction update. 1821 // 1822 // FIXME: If the step is non-constant, we create the vector splat with 1823 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1824 // handle a constant vector splat. 1825 assert(!VF.Scalable && "scalable vectors not yet supported."); 1826 Value *SplatVF = isa<Constant>(Mul) 1827 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 1828 : Builder.CreateVectorSplat(VF, Mul); 1829 Builder.restoreIP(CurrIP); 1830 1831 // We may need to add the step a number of times, depending on the unroll 1832 // factor. The last of those goes into the PHI. 1833 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1834 &*LoopVectorBody->getFirstInsertionPt()); 1835 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1836 Instruction *LastInduction = VecInd; 1837 for (unsigned Part = 0; Part < UF; ++Part) { 1838 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1839 1840 if (isa<TruncInst>(EntryVal)) 1841 addMetadata(LastInduction, EntryVal); 1842 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1843 1844 LastInduction = cast<Instruction>(addFastMathFlag( 1845 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1846 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1847 } 1848 1849 // Move the last step to the end of the latch block. This ensures consistent 1850 // placement of all induction updates. 
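  // For example, with UF = 2 the vector IV is advanced by two "step.add"
  // instructions per vector iteration; only the final update is renamed
  // "vec.ind.next" below, moved in front of the latch compare, and fed back
  // into the "vec.ind" phi.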
1851 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1852 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1853 auto *ICmp = cast<Instruction>(Br->getCondition()); 1854 LastInduction->moveBefore(ICmp); 1855 LastInduction->setName("vec.ind.next"); 1856 1857 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1858 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1859 } 1860 1861 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1862 return Cost->isScalarAfterVectorization(I, VF) || 1863 Cost->isProfitableToScalarize(I, VF); 1864 } 1865 1866 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1867 if (shouldScalarizeInstruction(IV)) 1868 return true; 1869 auto isScalarInst = [&](User *U) -> bool { 1870 auto *I = cast<Instruction>(U); 1871 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1872 }; 1873 return llvm::any_of(IV->users(), isScalarInst); 1874 } 1875 1876 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1877 const InductionDescriptor &ID, const Instruction *EntryVal, 1878 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1879 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1880 "Expected either an induction phi-node or a truncate of it!"); 1881 1882 // This induction variable is not the phi from the original loop but the 1883 // newly-created IV based on the proof that casted Phi is equal to the 1884 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1885 // re-uses the same InductionDescriptor that original IV uses but we don't 1886 // have to do any recording in this case - that is done when original IV is 1887 // processed. 1888 if (isa<TruncInst>(EntryVal)) 1889 return; 1890 1891 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1892 if (Casts.empty()) 1893 return; 1894 // Only the first Cast instruction in the Casts vector is of interest. 1895 // The rest of the Casts (if exist) have no uses outside the 1896 // induction update chain itself. 1897 Instruction *CastInst = *Casts.begin(); 1898 if (Lane < UINT_MAX) 1899 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1900 else 1901 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1902 } 1903 1904 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1905 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1906 "Primary induction variable must have an integer type"); 1907 1908 auto II = Legal->getInductionVars().find(IV); 1909 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 1910 1911 auto ID = II->second; 1912 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1913 1914 // The value from the original loop to which we are mapping the new induction 1915 // variable. 1916 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1917 1918 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1919 1920 // Generate code for the induction step. 
Note that induction steps are 1921 // required to be loop-invariant 1922 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 1923 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 1924 "Induction step should be loop invariant"); 1925 if (PSE.getSE()->isSCEVable(IV->getType())) { 1926 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1927 return Exp.expandCodeFor(Step, Step->getType(), 1928 LoopVectorPreHeader->getTerminator()); 1929 } 1930 return cast<SCEVUnknown>(Step)->getValue(); 1931 }; 1932 1933 // The scalar value to broadcast. This is derived from the canonical 1934 // induction variable. If a truncation type is given, truncate the canonical 1935 // induction variable and step. Otherwise, derive these values from the 1936 // induction descriptor. 1937 auto CreateScalarIV = [&](Value *&Step) -> Value * { 1938 Value *ScalarIV = Induction; 1939 if (IV != OldInduction) { 1940 ScalarIV = IV->getType()->isIntegerTy() 1941 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1942 : Builder.CreateCast(Instruction::SIToFP, Induction, 1943 IV->getType()); 1944 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1945 ScalarIV->setName("offset.idx"); 1946 } 1947 if (Trunc) { 1948 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1949 assert(Step->getType()->isIntegerTy() && 1950 "Truncation requires an integer step"); 1951 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1952 Step = Builder.CreateTrunc(Step, TruncType); 1953 } 1954 return ScalarIV; 1955 }; 1956 1957 // Create the vector values from the scalar IV, in the absence of creating a 1958 // vector IV. 1959 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 1960 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1961 for (unsigned Part = 0; Part < UF; ++Part) { 1962 assert(!VF.Scalable && "scalable vectors not yet supported."); 1963 Value *EntryPart = getStepVector(Broadcasted, VF.Min * Part, Step, 1964 ID.getInductionOpcode()); 1965 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1966 if (Trunc) 1967 addMetadata(EntryPart, Trunc); 1968 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1969 } 1970 }; 1971 1972 // Now do the actual transformations, and start with creating the step value. 1973 Value *Step = CreateStepValue(ID.getStep()); 1974 if (VF.isZero() || VF.isScalar()) { 1975 Value *ScalarIV = CreateScalarIV(Step); 1976 CreateSplatIV(ScalarIV, Step); 1977 return; 1978 } 1979 1980 // Determine if we want a scalar version of the induction variable. This is 1981 // true if the induction variable itself is not widened, or if it has at 1982 // least one user in the loop that is not widened. 1983 auto NeedsScalarIV = needsScalarInduction(EntryVal); 1984 if (!NeedsScalarIV) { 1985 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1986 return; 1987 } 1988 1989 // Try to create a new independent vector induction variable. If we can't 1990 // create the phi node, we will splat the scalar induction variable in each 1991 // loop iteration. 1992 if (!shouldScalarizeInstruction(EntryVal)) { 1993 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1994 Value *ScalarIV = CreateScalarIV(Step); 1995 // Create scalar steps that can be used by instructions we will later 1996 // scalarize. Note that the addition of the scalar steps will not increase 1997 // the number of instructions in the loop in the common case prior to 1998 // InstCombine. We will be trading one vector extract for each scalar step. 
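    // As an illustration, with VF = 4 and UF = 1 buildScalarSteps emits
    // ScalarIV + 0 * Step, ScalarIV + 1 * Step, ScalarIV + 2 * Step and
    // ScalarIV + 3 * Step, one value per lane.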
1999 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2000 return; 2001 } 2002 2003 // All IV users are scalar instructions, so only emit a scalar IV, not a 2004 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2005 // predicate used by the masked loads/stores. 2006 Value *ScalarIV = CreateScalarIV(Step); 2007 if (!Cost->isScalarEpilogueAllowed()) 2008 CreateSplatIV(ScalarIV, Step); 2009 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2010 } 2011 2012 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2013 Instruction::BinaryOps BinOp) { 2014 // Create and check the types. 2015 auto *ValVTy = cast<VectorType>(Val->getType()); 2016 int VLen = ValVTy->getNumElements(); 2017 2018 Type *STy = Val->getType()->getScalarType(); 2019 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2020 "Induction Step must be an integer or FP"); 2021 assert(Step->getType() == STy && "Step has wrong type"); 2022 2023 SmallVector<Constant *, 8> Indices; 2024 2025 if (STy->isIntegerTy()) { 2026 // Create a vector of consecutive numbers from zero to VF. 2027 for (int i = 0; i < VLen; ++i) 2028 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2029 2030 // Add the consecutive indices to the vector value. 2031 Constant *Cv = ConstantVector::get(Indices); 2032 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2033 Step = Builder.CreateVectorSplat(VLen, Step); 2034 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2035 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2036 // which can be found from the original scalar operations. 2037 Step = Builder.CreateMul(Cv, Step); 2038 return Builder.CreateAdd(Val, Step, "induction"); 2039 } 2040 2041 // Floating point induction. 2042 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2043 "Binary Opcode should be specified for FP induction"); 2044 // Create a vector of consecutive numbers from zero to VF. 2045 for (int i = 0; i < VLen; ++i) 2046 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2047 2048 // Add the consecutive indices to the vector value. 2049 Constant *Cv = ConstantVector::get(Indices); 2050 2051 Step = Builder.CreateVectorSplat(VLen, Step); 2052 2053 // Floating point operations had to be 'fast' to enable the induction. 2054 FastMathFlags Flags; 2055 Flags.setFast(); 2056 2057 Value *MulOp = Builder.CreateFMul(Cv, Step); 2058 if (isa<Instruction>(MulOp)) 2059 // Have to check, MulOp may be a constant 2060 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 2061 2062 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2063 if (isa<Instruction>(BOp)) 2064 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2065 return BOp; 2066 } 2067 2068 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2069 Instruction *EntryVal, 2070 const InductionDescriptor &ID) { 2071 // We shouldn't have to build scalar steps if we aren't vectorizing. 2072 assert(VF.isVector() && "VF should be greater than one"); 2073 assert(!VF.Scalable && 2074 "the code below assumes a fixed number of elements at compile time"); 2075 // Get the value type and ensure it and the step have the same integer type. 2076 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2077 assert(ScalarIVTy == Step->getType() && 2078 "Val and Step should have the same type"); 2079 2080 // We build scalar steps for both integer and floating-point induction 2081 // variables. 
Here, we determine the kind of arithmetic we will perform. 2082 Instruction::BinaryOps AddOp; 2083 Instruction::BinaryOps MulOp; 2084 if (ScalarIVTy->isIntegerTy()) { 2085 AddOp = Instruction::Add; 2086 MulOp = Instruction::Mul; 2087 } else { 2088 AddOp = ID.getInductionOpcode(); 2089 MulOp = Instruction::FMul; 2090 } 2091 2092 // Determine the number of scalars we need to generate for each unroll 2093 // iteration. If EntryVal is uniform, we only need to generate the first 2094 // lane. Otherwise, we generate all VF values. 2095 unsigned Lanes = 2096 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2097 ? 1 2098 : VF.Min; 2099 // Compute the scalar steps and save the results in VectorLoopValueMap. 2100 for (unsigned Part = 0; Part < UF; ++Part) { 2101 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2102 auto *StartIdx = 2103 getSignedIntOrFpConstant(ScalarIVTy, VF.Min * Part + Lane); 2104 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2105 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2106 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2107 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2108 } 2109 } 2110 } 2111 2112 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2113 assert(V != Induction && "The new induction variable should not be used."); 2114 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2115 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2116 2117 // If we have a stride that is replaced by one, do it here. Defer this for 2118 // the VPlan-native path until we start running Legal checks in that path. 2119 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2120 V = ConstantInt::get(V->getType(), 1); 2121 2122 // If we have a vector mapped to this value, return it. 2123 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2124 return VectorLoopValueMap.getVectorValue(V, Part); 2125 2126 // If the value has not been vectorized, check if it has been scalarized 2127 // instead. If it has been scalarized, and we actually need the value in 2128 // vector form, we will construct the vector values on demand. 2129 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2130 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2131 2132 // If we've scalarized a value, that value should be an instruction. 2133 auto *I = cast<Instruction>(V); 2134 2135 // If we aren't vectorizing, we can just copy the scalar map values over to 2136 // the vector map. 2137 if (VF == 1) { 2138 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2139 return ScalarValue; 2140 } 2141 2142 // Get the last scalar instruction we generated for V and Part. If the value 2143 // is known to be uniform after vectorization, this corresponds to lane zero 2144 // of the Part unroll iteration. Otherwise, the last instruction is the one 2145 // we created for the last vector lane of the Part unroll iteration. 2146 assert(!VF.Scalable && "scalable vectors not yet supported."); 2147 unsigned LastLane = 2148 Cost->isUniformAfterVectorization(I, VF) ? 0 : VF.Min - 1; 2149 auto *LastInst = cast<Instruction>( 2150 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2151 2152 // Set the insert point after the last scalarized instruction. This ensures 2153 // the insertelement sequence will directly follow the scalar definitions. 
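    // For example, with VF = 4 a non-uniform value is packed by four
    // insertelement instructions (see packScalarIntoVectorValue), all emitted
    // right after the scalar definition for lane 3.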
2154 auto OldIP = Builder.saveIP(); 2155 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2156 Builder.SetInsertPoint(&*NewIP); 2157 2158 // However, if we are vectorizing, we need to construct the vector values. 2159 // If the value is known to be uniform after vectorization, we can just 2160 // broadcast the scalar value corresponding to lane zero for each unroll 2161 // iteration. Otherwise, we construct the vector values using insertelement 2162 // instructions. Since the resulting vectors are stored in 2163 // VectorLoopValueMap, we will only generate the insertelements once. 2164 Value *VectorValue = nullptr; 2165 if (Cost->isUniformAfterVectorization(I, VF)) { 2166 VectorValue = getBroadcastInstrs(ScalarValue); 2167 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2168 } else { 2169 // Initialize packing with insertelements to start from undef. 2170 assert(!VF.Scalable && "VF is assumed to be non scalable."); 2171 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2172 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2173 for (unsigned Lane = 0; Lane < VF.Min; ++Lane) 2174 packScalarIntoVectorValue(V, {Part, Lane}); 2175 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2176 } 2177 Builder.restoreIP(OldIP); 2178 return VectorValue; 2179 } 2180 2181 // If this scalar is unknown, assume that it is a constant or that it is 2182 // loop invariant. Broadcast V and save the value for future uses. 2183 Value *B = getBroadcastInstrs(V); 2184 VectorLoopValueMap.setVectorValue(V, Part, B); 2185 return B; 2186 } 2187 2188 Value * 2189 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2190 const VPIteration &Instance) { 2191 // If the value is not an instruction contained in the loop, it should 2192 // already be scalar. 2193 if (OrigLoop->isLoopInvariant(V)) 2194 return V; 2195 2196 assert(Instance.Lane > 0 2197 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2198 : true && "Uniform values only have lane zero"); 2199 2200 // If the value from the original loop has not been vectorized, it is 2201 // represented by UF x VF scalar values in the new loop. Return the requested 2202 // scalar value. 2203 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2204 return VectorLoopValueMap.getScalarValue(V, Instance); 2205 2206 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2207 // for the given unroll part. If this entry is not a vector type (i.e., the 2208 // vectorization factor is one), there is no need to generate an 2209 // extractelement instruction. 2210 auto *U = getOrCreateVectorValue(V, Instance.Part); 2211 if (!U->getType()->isVectorTy()) { 2212 assert(VF == 1 && "Value not scalarized has non-vector type"); 2213 return U; 2214 } 2215 2216 // Otherwise, the value from the original loop has been vectorized and is 2217 // represented by UF vector values. Extract and return the requested scalar 2218 // value from the appropriate vector lane. 
2219 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2220 } 2221 2222 void InnerLoopVectorizer::packScalarIntoVectorValue( 2223 Value *V, const VPIteration &Instance) { 2224 assert(V != Induction && "The new induction variable should not be used."); 2225 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2226 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2227 2228 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2229 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2230 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2231 Builder.getInt32(Instance.Lane)); 2232 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2233 } 2234 2235 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2236 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2237 assert(!VF.Scalable && "Cannot reverse scalable vectors"); 2238 SmallVector<int, 8> ShuffleMask; 2239 for (unsigned i = 0; i < VF.Min; ++i) 2240 ShuffleMask.push_back(VF.Min - i - 1); 2241 2242 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2243 ShuffleMask, "reverse"); 2244 } 2245 2246 // Return whether we allow using masked interleave-groups (for dealing with 2247 // strided loads/stores that reside in predicated blocks, or for dealing 2248 // with gaps). 2249 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2250 // If an override option has been passed in for interleaved accesses, use it. 2251 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2252 return EnableMaskedInterleavedMemAccesses; 2253 2254 return TTI.enableMaskedInterleavedAccessVectorization(); 2255 } 2256 2257 // Try to vectorize the interleave group that \p Instr belongs to. 2258 // 2259 // E.g. Translate following interleaved load group (factor = 3): 2260 // for (i = 0; i < N; i+=3) { 2261 // R = Pic[i]; // Member of index 0 2262 // G = Pic[i+1]; // Member of index 1 2263 // B = Pic[i+2]; // Member of index 2 2264 // ... // do something to R, G, B 2265 // } 2266 // To: 2267 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2268 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2269 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2270 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2271 // 2272 // Or translate following interleaved store group (factor = 3): 2273 // for (i = 0; i < N; i+=3) { 2274 // ... do something to R, G, B 2275 // Pic[i] = R; // Member of index 0 2276 // Pic[i+1] = G; // Member of index 1 2277 // Pic[i+2] = B; // Member of index 2 2278 // } 2279 // To: 2280 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2281 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2282 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2283 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2284 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2285 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2286 const InterleaveGroup<Instruction> *Group, VPTransformState &State, 2287 VPValue *Addr, VPValue *BlockInMask) { 2288 Instruction *Instr = Group->getInsertPos(); 2289 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2290 2291 // Prepare for the vector type of the interleaved load/store. 
2292 Type *ScalarTy = getMemInstValueType(Instr); 2293 unsigned InterleaveFactor = Group->getFactor(); 2294 assert(!VF.Scalable && "scalable vectors not yet supported."); 2295 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2296 2297 // Prepare for the new pointers. 2298 SmallVector<Value *, 2> AddrParts; 2299 unsigned Index = Group->getIndex(Instr); 2300 2301 // TODO: extend the masked interleaved-group support to reversed access. 2302 assert((!BlockInMask || !Group->isReverse()) && 2303 "Reversed masked interleave-group not supported."); 2304 2305 // If the group is reverse, adjust the index to refer to the last vector lane 2306 // instead of the first. We adjust the index from the first vector lane, 2307 // rather than directly getting the pointer for lane VF - 1, because the 2308 // pointer operand of the interleaved access is supposed to be uniform. For 2309 // uniform instructions, we're only required to generate a value for the 2310 // first vector lane in each unroll iteration. 2311 assert(!VF.Scalable && 2312 "scalable vector reverse operation is not implemented"); 2313 if (Group->isReverse()) 2314 Index += (VF.Min - 1) * Group->getFactor(); 2315 2316 for (unsigned Part = 0; Part < UF; Part++) { 2317 Value *AddrPart = State.get(Addr, {Part, 0}); 2318 setDebugLocFromInst(Builder, AddrPart); 2319 2320 // Notice current instruction could be any index. Need to adjust the address 2321 // to the member of index 0. 2322 // 2323 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2324 // b = A[i]; // Member of index 0 2325 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2326 // 2327 // E.g. A[i+1] = a; // Member of index 1 2328 // A[i] = b; // Member of index 0 2329 // A[i+2] = c; // Member of index 2 (Current instruction) 2330 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2331 2332 bool InBounds = false; 2333 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2334 InBounds = gep->isInBounds(); 2335 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2336 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2337 2338 // Cast to the vector pointer type. 2339 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2340 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2341 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2342 } 2343 2344 setDebugLocFromInst(Builder, Instr); 2345 Value *UndefVec = UndefValue::get(VecTy); 2346 2347 Value *MaskForGaps = nullptr; 2348 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2349 assert(!VF.Scalable && "scalable vectors not yet supported."); 2350 MaskForGaps = createBitMaskForGaps(Builder, VF.Min, *Group); 2351 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2352 } 2353 2354 // Vectorize the interleaved load group. 2355 if (isa<LoadInst>(Instr)) { 2356 // For each unroll part, create a wide load for the group. 
2357 SmallVector<Value *, 2> NewLoads; 2358 for (unsigned Part = 0; Part < UF; Part++) { 2359 Instruction *NewLoad; 2360 if (BlockInMask || MaskForGaps) { 2361 assert(useMaskedInterleavedAccesses(*TTI) && 2362 "masked interleaved groups are not allowed."); 2363 Value *GroupMask = MaskForGaps; 2364 if (BlockInMask) { 2365 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2366 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2367 assert(!VF.Scalable && "scalable vectors not yet supported."); 2368 Value *ShuffledMask = Builder.CreateShuffleVector( 2369 BlockInMaskPart, Undefs, 2370 createReplicatedMask(InterleaveFactor, VF.Min), 2371 "interleaved.mask"); 2372 GroupMask = MaskForGaps 2373 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2374 MaskForGaps) 2375 : ShuffledMask; 2376 } 2377 NewLoad = 2378 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2379 GroupMask, UndefVec, "wide.masked.vec"); 2380 } 2381 else 2382 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2383 Group->getAlign(), "wide.vec"); 2384 Group->addMetadata(NewLoad); 2385 NewLoads.push_back(NewLoad); 2386 } 2387 2388 // For each member in the group, shuffle out the appropriate data from the 2389 // wide loads. 2390 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2391 Instruction *Member = Group->getMember(I); 2392 2393 // Skip the gaps in the group. 2394 if (!Member) 2395 continue; 2396 2397 assert(!VF.Scalable && "scalable vectors not yet supported."); 2398 auto StrideMask = createStrideMask(I, InterleaveFactor, VF.Min); 2399 for (unsigned Part = 0; Part < UF; Part++) { 2400 Value *StridedVec = Builder.CreateShuffleVector( 2401 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2402 2403 // If this member has different type, cast the result type. 2404 if (Member->getType() != ScalarTy) { 2405 assert(!VF.Scalable && "VF is assumed to be non scalable."); 2406 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2407 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2408 } 2409 2410 if (Group->isReverse()) 2411 StridedVec = reverseVector(StridedVec); 2412 2413 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2414 } 2415 } 2416 return; 2417 } 2418 2419 // The sub vector type for current instruction. 2420 assert(!VF.Scalable && "VF is assumed to be non scalable."); 2421 auto *SubVT = VectorType::get(ScalarTy, VF); 2422 2423 // Vectorize the interleaved store group. 2424 for (unsigned Part = 0; Part < UF; Part++) { 2425 // Collect the stored vector from each member. 2426 SmallVector<Value *, 4> StoredVecs; 2427 for (unsigned i = 0; i < InterleaveFactor; i++) { 2428 // Interleaved store group doesn't allow a gap, so each index has a member 2429 Instruction *Member = Group->getMember(i); 2430 assert(Member && "Fail to get a member from an interleaved store group"); 2431 2432 Value *StoredVec = getOrCreateVectorValue( 2433 cast<StoreInst>(Member)->getValueOperand(), Part); 2434 if (Group->isReverse()) 2435 StoredVec = reverseVector(StoredVec); 2436 2437 // If this member has different type, cast it to a unified type. 2438 2439 if (StoredVec->getType() != SubVT) 2440 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2441 2442 StoredVecs.push_back(StoredVec); 2443 } 2444 2445 // Concatenate all vectors into a wide vector. 2446 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2447 2448 // Interleave the elements in the wide vector. 
2449 assert(!VF.Scalable && "scalable vectors not yet supported."); 2450 Value *IVec = Builder.CreateShuffleVector( 2451 WideVec, UndefVec, createInterleaveMask(VF.Min, InterleaveFactor), 2452 "interleaved.vec"); 2453 2454 Instruction *NewStoreInstr; 2455 if (BlockInMask) { 2456 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2457 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2458 Value *ShuffledMask = Builder.CreateShuffleVector( 2459 BlockInMaskPart, Undefs, 2460 createReplicatedMask(InterleaveFactor, VF.Min), "interleaved.mask"); 2461 NewStoreInstr = Builder.CreateMaskedStore( 2462 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2463 } 2464 else 2465 NewStoreInstr = 2466 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2467 2468 Group->addMetadata(NewStoreInstr); 2469 } 2470 } 2471 2472 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2473 VPTransformState &State, 2474 VPValue *Addr, 2475 VPValue *StoredValue, 2476 VPValue *BlockInMask) { 2477 // Attempt to issue a wide load. 2478 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2479 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2480 2481 assert((LI || SI) && "Invalid Load/Store instruction"); 2482 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2483 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2484 2485 LoopVectorizationCostModel::InstWidening Decision = 2486 Cost->getWideningDecision(Instr, VF); 2487 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2488 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2489 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2490 "CM decision is not to widen the memory instruction"); 2491 2492 Type *ScalarDataTy = getMemInstValueType(Instr); 2493 2494 assert(!VF.Scalable && "scalable vectors not yet supported."); 2495 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2496 const Align Alignment = getLoadStoreAlignment(Instr); 2497 2498 // Determine if the pointer operand of the access is either consecutive or 2499 // reverse consecutive. 2500 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2501 bool ConsecutiveStride = 2502 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2503 bool CreateGatherScatter = 2504 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2505 2506 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2507 // gather/scatter. Otherwise Decision should have been to Scalarize. 2508 assert((ConsecutiveStride || CreateGatherScatter) && 2509 "The instruction should be scalarized"); 2510 (void)ConsecutiveStride; 2511 2512 VectorParts BlockInMaskParts(UF); 2513 bool isMaskRequired = BlockInMask; 2514 if (isMaskRequired) 2515 for (unsigned Part = 0; Part < UF; ++Part) 2516 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2517 2518 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2519 // Calculate the pointer for the specific unroll-part. 2520 GetElementPtrInst *PartPtr = nullptr; 2521 2522 bool InBounds = false; 2523 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2524 InBounds = gep->isInBounds(); 2525 2526 if (Reverse) { 2527 // If the address is consecutive but reversed, then the 2528 // wide store needs to start at the last vector element. 
2529 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2530 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.Min))); 2531 PartPtr->setIsInBounds(InBounds); 2532 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2533 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.Min))); 2534 PartPtr->setIsInBounds(InBounds); 2535 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2536 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2537 } else { 2538 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2539 ScalarDataTy, Ptr, Builder.getInt32(Part * VF.Min))); 2540 PartPtr->setIsInBounds(InBounds); 2541 } 2542 2543 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2544 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2545 }; 2546 2547 // Handle Stores: 2548 if (SI) { 2549 setDebugLocFromInst(Builder, SI); 2550 2551 for (unsigned Part = 0; Part < UF; ++Part) { 2552 Instruction *NewSI = nullptr; 2553 Value *StoredVal = State.get(StoredValue, Part); 2554 if (CreateGatherScatter) { 2555 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2556 Value *VectorGep = State.get(Addr, Part); 2557 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2558 MaskPart); 2559 } else { 2560 if (Reverse) { 2561 // If we store to reverse consecutive memory locations, then we need 2562 // to reverse the order of elements in the stored value. 2563 StoredVal = reverseVector(StoredVal); 2564 // We don't want to update the value in the map as it might be used in 2565 // another expression. So don't call resetVectorValue(StoredVal). 2566 } 2567 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2568 if (isMaskRequired) 2569 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2570 BlockInMaskParts[Part]); 2571 else 2572 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2573 } 2574 addMetadata(NewSI, SI); 2575 } 2576 return; 2577 } 2578 2579 // Handle loads. 2580 assert(LI && "Must have a load instruction"); 2581 setDebugLocFromInst(Builder, LI); 2582 for (unsigned Part = 0; Part < UF; ++Part) { 2583 Value *NewLI; 2584 if (CreateGatherScatter) { 2585 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2586 Value *VectorGep = State.get(Addr, Part); 2587 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2588 nullptr, "wide.masked.gather"); 2589 addMetadata(NewLI, LI); 2590 } else { 2591 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2592 if (isMaskRequired) 2593 NewLI = Builder.CreateMaskedLoad( 2594 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2595 "wide.masked.load"); 2596 else 2597 NewLI = 2598 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2599 2600 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2601 addMetadata(NewLI, LI); 2602 if (Reverse) 2603 NewLI = reverseVector(NewLI); 2604 } 2605 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2606 } 2607 } 2608 2609 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2610 const VPIteration &Instance, 2611 bool IfPredicateInstr, 2612 VPTransformState &State) { 2613 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2614 2615 setDebugLocFromInst(Builder, Instr); 2616 2617 // Does this instruction return a value ? 
2618 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2619 2620 Instruction *Cloned = Instr->clone(); 2621 if (!IsVoidRetTy) 2622 Cloned->setName(Instr->getName() + ".cloned"); 2623 2624 // Replace the operands of the cloned instructions with their scalar 2625 // equivalents in the new loop. 2626 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2627 auto *NewOp = State.get(User.getOperand(op), Instance); 2628 Cloned->setOperand(op, NewOp); 2629 } 2630 addNewMetadata(Cloned, Instr); 2631 2632 // Place the cloned scalar in the new loop. 2633 Builder.Insert(Cloned); 2634 2635 // Add the cloned scalar to the scalar map entry. 2636 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2637 2638 // If we just cloned a new assumption, add it the assumption cache. 2639 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2640 if (II->getIntrinsicID() == Intrinsic::assume) 2641 AC->registerAssumption(II); 2642 2643 // End if-block. 2644 if (IfPredicateInstr) 2645 PredicatedInstructions.push_back(Cloned); 2646 } 2647 2648 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2649 Value *End, Value *Step, 2650 Instruction *DL) { 2651 BasicBlock *Header = L->getHeader(); 2652 BasicBlock *Latch = L->getLoopLatch(); 2653 // As we're just creating this loop, it's possible no latch exists 2654 // yet. If so, use the header as this will be a single block loop. 2655 if (!Latch) 2656 Latch = Header; 2657 2658 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2659 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2660 setDebugLocFromInst(Builder, OldInst); 2661 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2662 2663 Builder.SetInsertPoint(Latch->getTerminator()); 2664 setDebugLocFromInst(Builder, OldInst); 2665 2666 // Create i+1 and fill the PHINode. 2667 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2668 Induction->addIncoming(Start, L->getLoopPreheader()); 2669 Induction->addIncoming(Next, Latch); 2670 // Create the compare. 2671 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2672 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2673 2674 // Now we have two terminators. Remove the old one from the block. 2675 Latch->getTerminator()->eraseFromParent(); 2676 2677 return Induction; 2678 } 2679 2680 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2681 if (TripCount) 2682 return TripCount; 2683 2684 assert(L && "Create Trip Count for null loop."); 2685 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2686 // Find the loop boundaries. 2687 ScalarEvolution *SE = PSE.getSE(); 2688 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2689 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2690 "Invalid loop count"); 2691 2692 Type *IdxTy = Legal->getWidestInductionType(); 2693 assert(IdxTy && "No type for induction"); 2694 2695 // The exit count might have the type of i64 while the phi is i32. This can 2696 // happen if we have an induction variable that is sign extended before the 2697 // compare. The only way that we get a backedge taken count is that the 2698 // induction variable was signed and as such will not overflow. In such a case 2699 // truncation is legal. 
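  // For example, if the backedge-taken count was computed as an i64 but the
  // widest induction type is i32, it is truncated to i32 here; a narrower
  // count is instead zero-extended to the induction type.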
2700 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2701 IdxTy->getPrimitiveSizeInBits()) 2702 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2703 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2704 2705 // Get the total trip count from the count by adding 1. 2706 const SCEV *ExitCount = SE->getAddExpr( 2707 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2708 2709 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2710 2711 // Expand the trip count and place the new instructions in the preheader. 2712 // Notice that the pre-header does not change, only the loop body. 2713 SCEVExpander Exp(*SE, DL, "induction"); 2714 2715 // Count holds the overall loop count (N). 2716 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2717 L->getLoopPreheader()->getTerminator()); 2718 2719 if (TripCount->getType()->isPointerTy()) 2720 TripCount = 2721 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2722 L->getLoopPreheader()->getTerminator()); 2723 2724 return TripCount; 2725 } 2726 2727 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2728 if (VectorTripCount) 2729 return VectorTripCount; 2730 2731 Value *TC = getOrCreateTripCount(L); 2732 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2733 2734 Type *Ty = TC->getType(); 2735 // This is where we can make the step a runtime constant. 2736 assert(!VF.Scalable && "scalable vectorization is not supported yet"); 2737 Constant *Step = ConstantInt::get(Ty, VF.Min * UF); 2738 2739 // If the tail is to be folded by masking, round the number of iterations N 2740 // up to a multiple of Step instead of rounding down. This is done by first 2741 // adding Step-1 and then rounding down. Note that it's ok if this addition 2742 // overflows: the vector induction variable will eventually wrap to zero given 2743 // that it starts at zero and its Step is a power of two; the loop will then 2744 // exit, with the last early-exit vector comparison also producing all-true. 2745 if (Cost->foldTailByMasking()) { 2746 assert(isPowerOf2_32(VF.Min * UF) && 2747 "VF*UF must be a power of 2 when folding tail by masking"); 2748 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF.Min * UF - 1), 2749 "n.rnd.up"); 2750 } 2751 2752 // Now we need to generate the expression for the part of the loop that the 2753 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2754 // iterations are not required for correctness, or N - Step, otherwise. Step 2755 // is equal to the vectorization factor (number of SIMD elements) times the 2756 // unroll factor (number of SIMD instructions). 2757 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2758 2759 // If there is a non-reversed interleaved group that may speculatively access 2760 // memory out-of-bounds, we need to ensure that there will be at least one 2761 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2762 // the trip count, we set the remainder to be equal to the step. If the step 2763 // does not evenly divide the trip count, no adjustment is necessary since 2764 // there will already be scalar iterations. Note that the minimum iterations 2765 // check ensures that N >= Step. 
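  // A worked example (illustrative) with VF = 4 and UF = 2, i.e. Step = 8:
  //   TC = 10, no tail folding: n.mod.vf = 2, n.vec = 8, 2 scalar iterations.
  //   TC = 16, scalar epilogue required: n.mod.vf = 0 is bumped up to 8, so
  //            n.vec = 8 and the epilogue executes the remaining 8 iterations.
  //   TC = 10, tail folded: n.rnd.up = 17, n.vec = 16, no scalar iterations.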
2766 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 2767 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2768 R = Builder.CreateSelect(IsZero, Step, R); 2769 } 2770 2771 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2772 2773 return VectorTripCount; 2774 } 2775 2776 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2777 const DataLayout &DL) { 2778 // Verify that V is a vector type with same number of elements as DstVTy. 2779 assert(isa<FixedVectorType>(DstVTy) && 2780 "Vector type is assumed to be fixed width."); 2781 unsigned VF = DstVTy->getNumElements(); 2782 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2783 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2784 Type *SrcElemTy = SrcVecTy->getElementType(); 2785 Type *DstElemTy = DstVTy->getElementType(); 2786 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2787 "Vector elements must have same size"); 2788 2789 // Do a direct cast if element types are castable. 2790 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2791 return Builder.CreateBitOrPointerCast(V, DstVTy); 2792 } 2793 // V cannot be directly casted to desired vector type. 2794 // May happen when V is a floating point vector but DstVTy is a vector of 2795 // pointers or vice-versa. Handle this using a two-step bitcast using an 2796 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2797 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2798 "Only one type should be a pointer type"); 2799 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2800 "Only one type should be a floating point type"); 2801 Type *IntTy = 2802 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2803 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2804 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2805 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2806 } 2807 2808 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2809 BasicBlock *Bypass) { 2810 Value *Count = getOrCreateTripCount(L); 2811 // Reuse existing vector loop preheader for TC checks. 2812 // Note that new preheader block is generated for vector loop. 2813 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2814 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2815 2816 // Generate code to check if the loop's trip count is less than VF * UF, or 2817 // equal to it in case a scalar epilogue is required; this implies that the 2818 // vector trip count is zero. This check also covers the case where adding one 2819 // to the backedge-taken count overflowed leading to an incorrect trip count 2820 // of zero. In this case we will also jump to the scalar loop. 2821 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 2822 : ICmpInst::ICMP_ULT; 2823 2824 // If tail is to be folded, vector loop takes care of all iterations. 2825 Value *CheckMinIters = Builder.getFalse(); 2826 if (!Cost->foldTailByMasking()) { 2827 assert(!VF.Scalable && "scalable vectors not yet supported."); 2828 CheckMinIters = Builder.CreateICmp( 2829 P, Count, ConstantInt::get(Count->getType(), VF.Min * UF), 2830 "min.iters.check"); 2831 } 2832 // Create new preheader for vector loop. 
2833 LoopVectorPreHeader = 2834 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2835 "vector.ph"); 2836 2837 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2838 DT->getNode(Bypass)->getIDom()) && 2839 "TC check is expected to dominate Bypass"); 2840 2841 // Update dominator for Bypass & LoopExit. 2842 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2843 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2844 2845 ReplaceInstWithInst( 2846 TCCheckBlock->getTerminator(), 2847 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2848 LoopBypassBlocks.push_back(TCCheckBlock); 2849 } 2850 2851 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2852 // Reuse existing vector loop preheader for SCEV checks. 2853 // Note that new preheader block is generated for vector loop. 2854 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 2855 2856 // Generate the code to check that the SCEV assumptions that we made. 2857 // We want the new basic block to start at the first instruction in a 2858 // sequence of instructions that form a check. 2859 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2860 "scev.check"); 2861 Value *SCEVCheck = Exp.expandCodeForPredicate( 2862 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 2863 2864 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2865 if (C->isZero()) 2866 return; 2867 2868 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 2869 (OptForSizeBasedOnProfile && 2870 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 2871 "Cannot SCEV check stride or overflow when optimizing for size"); 2872 2873 SCEVCheckBlock->setName("vector.scevcheck"); 2874 // Create new preheader for vector loop. 2875 LoopVectorPreHeader = 2876 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 2877 nullptr, "vector.ph"); 2878 2879 // Update dominator only if this is first RT check. 2880 if (LoopBypassBlocks.empty()) { 2881 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2882 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2883 } 2884 2885 ReplaceInstWithInst( 2886 SCEVCheckBlock->getTerminator(), 2887 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 2888 LoopBypassBlocks.push_back(SCEVCheckBlock); 2889 AddedSafetyChecks = true; 2890 } 2891 2892 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2893 // VPlan-native path does not do any analysis for runtime checks currently. 2894 if (EnableVPlanNativePath) 2895 return; 2896 2897 // Reuse existing vector loop preheader for runtime memory checks. 2898 // Note that new preheader block is generated for vector loop. 2899 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 2900 2901 // Generate the code that checks in runtime if arrays overlap. We put the 2902 // checks into a separate block to make the more common case of few elements 2903 // faster. 
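  // For instance (illustrative), for a loop that writes a[i] and reads b[i],
  // the emitted check compares the accessed ranges [a, a+n) and [b, b+n) and
  // branches to the scalar loop (the bypass) if they overlap.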
2904 auto *LAI = Legal->getLAI(); 2905 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 2906 if (!RtPtrChecking.Need) 2907 return; 2908 Instruction *FirstCheckInst; 2909 Instruction *MemRuntimeCheck; 2910 std::tie(FirstCheckInst, MemRuntimeCheck) = 2911 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 2912 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 2913 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " 2914 "claimed checks are required"); 2915 2916 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 2917 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2918 "Cannot emit memory checks when optimizing for size, unless forced " 2919 "to vectorize."); 2920 ORE->emit([&]() { 2921 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2922 L->getStartLoc(), L->getHeader()) 2923 << "Code-size may be reduced by not forcing " 2924 "vectorization, or by source-code modifications " 2925 "eliminating the need for runtime checks " 2926 "(e.g., adding 'restrict')."; 2927 }); 2928 } 2929 2930 MemCheckBlock->setName("vector.memcheck"); 2931 // Create new preheader for vector loop. 2932 LoopVectorPreHeader = 2933 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2934 "vector.ph"); 2935 2936 // Update dominator only if this is first RT check. 2937 if (LoopBypassBlocks.empty()) { 2938 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2939 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2940 } 2941 2942 ReplaceInstWithInst( 2943 MemCheckBlock->getTerminator(), 2944 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); 2945 LoopBypassBlocks.push_back(MemCheckBlock); 2946 AddedSafetyChecks = true; 2947 2948 // We currently don't use LoopVersioning for the actual loop cloning but we 2949 // still use it to add the noalias metadata. 2950 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2951 PSE.getSE()); 2952 LVer->prepareNoAliasMetadata(); 2953 } 2954 2955 Value *InnerLoopVectorizer::emitTransformedIndex( 2956 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2957 const InductionDescriptor &ID) const { 2958 2959 SCEVExpander Exp(*SE, DL, "induction"); 2960 auto Step = ID.getStep(); 2961 auto StartValue = ID.getStartValue(); 2962 assert(Index->getType() == Step->getType() && 2963 "Index type does not match StepValue type"); 2964 2965 // Note: the IR at this point is broken. We cannot use SE to create any new 2966 // SCEV and then expand it, hoping that SCEV's simplification will give us 2967 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2968 // lead to various SCEV crashes. So all we can do is to use builder and rely 2969 // on InstCombine for future simplifications. Here we handle some trivial 2970 // cases only. 
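  // In shorthand, the value produced below is (illustrative):
  //   integer induction: Start + Index * Step (or Start - Index for Step == -1)
  //   pointer induction: getelementptr Start, Index * Step
  //   FP induction:      Start fadd/fsub (Index fmul Step), with fast-math flags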
2971 auto CreateAdd = [&B](Value *X, Value *Y) { 2972 assert(X->getType() == Y->getType() && "Types don't match!"); 2973 if (auto *CX = dyn_cast<ConstantInt>(X)) 2974 if (CX->isZero()) 2975 return Y; 2976 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2977 if (CY->isZero()) 2978 return X; 2979 return B.CreateAdd(X, Y); 2980 }; 2981 2982 auto CreateMul = [&B](Value *X, Value *Y) { 2983 assert(X->getType() == Y->getType() && "Types don't match!"); 2984 if (auto *CX = dyn_cast<ConstantInt>(X)) 2985 if (CX->isOne()) 2986 return Y; 2987 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2988 if (CY->isOne()) 2989 return X; 2990 return B.CreateMul(X, Y); 2991 }; 2992 2993 // Get a suitable insert point for SCEV expansion. For blocks in the vector 2994 // loop, choose the end of the vector loop header (=LoopVectorBody), because 2995 // the DomTree is not kept up-to-date for additional blocks generated in the 2996 // vector loop. By using the header as insertion point, we guarantee that the 2997 // expanded instructions dominate all their uses. 2998 auto GetInsertPoint = [this, &B]() { 2999 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3000 if (InsertBB != LoopVectorBody && 3001 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3002 return LoopVectorBody->getTerminator(); 3003 return &*B.GetInsertPoint(); 3004 }; 3005 switch (ID.getKind()) { 3006 case InductionDescriptor::IK_IntInduction: { 3007 assert(Index->getType() == StartValue->getType() && 3008 "Index type does not match StartValue type"); 3009 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3010 return B.CreateSub(StartValue, Index); 3011 auto *Offset = CreateMul( 3012 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3013 return CreateAdd(StartValue, Offset); 3014 } 3015 case InductionDescriptor::IK_PtrInduction: { 3016 assert(isa<SCEVConstant>(Step) && 3017 "Expected constant step for pointer induction"); 3018 return B.CreateGEP( 3019 StartValue->getType()->getPointerElementType(), StartValue, 3020 CreateMul(Index, 3021 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3022 } 3023 case InductionDescriptor::IK_FpInduction: { 3024 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3025 auto InductionBinOp = ID.getInductionBinOp(); 3026 assert(InductionBinOp && 3027 (InductionBinOp->getOpcode() == Instruction::FAdd || 3028 InductionBinOp->getOpcode() == Instruction::FSub) && 3029 "Original bin op should be defined for FP induction"); 3030 3031 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3032 3033 // Floating point operations had to be 'fast' to enable the induction. 3034 FastMathFlags Flags; 3035 Flags.setFast(); 3036 3037 Value *MulExp = B.CreateFMul(StepValue, Index); 3038 if (isa<Instruction>(MulExp)) 3039 // We have to check, the MulExp may be a constant. 
3040       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
3041
3042     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3043                                "induction");
3044     if (isa<Instruction>(BOp))
3045       cast<Instruction>(BOp)->setFastMathFlags(Flags);
3046
3047     return BOp;
3048   }
3049   case InductionDescriptor::IK_NoInduction:
3050     return nullptr;
3051   }
3052   llvm_unreachable("invalid enum");
3053 }
3054
3055 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3056   LoopScalarBody = OrigLoop->getHeader();
3057   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3058   LoopExitBlock = OrigLoop->getExitBlock();
3059   assert(LoopExitBlock && "Must have an exit block");
3060   assert(LoopVectorPreHeader && "Invalid loop structure");
3061
3062   LoopMiddleBlock =
3063       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3064                  LI, nullptr, Twine(Prefix) + "middle.block");
3065   LoopScalarPreHeader =
3066       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3067                  nullptr, Twine(Prefix) + "scalar.ph");
3068   // We intentionally don't let SplitBlock update LoopInfo, since
3069   // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
3070   // LoopVectorBody is explicitly added to the correct place a few lines later.
3071   LoopVectorBody =
3072       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3073                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3074
3075   // Update dominator for loop exit.
3076   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3077
3078   // Create and register the new vector loop.
3079   Loop *Lp = LI->AllocateLoop();
3080   Loop *ParentLoop = OrigLoop->getParentLoop();
3081
3082   // Insert the new loop into the loop nest and register the new basic blocks
3083   // before calling any utilities such as SCEV that require valid LoopInfo.
3084   if (ParentLoop) {
3085     ParentLoop->addChildLoop(Lp);
3086   } else {
3087     LI->addTopLevelLoop(Lp);
3088   }
3089   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3090   return Lp;
3091 }
3092
3093 void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
3094                                                       Value *VectorTripCount) {
3095   assert(VectorTripCount && L && "Expected valid arguments");
3096   // We are going to resume the execution of the scalar loop.
3097   // Go over all of the induction variables that we found and fix the
3098   // PHIs that are left in the scalar version of the loop.
3099   // The starting values of PHI nodes depend on the counter of the last
3100   // iteration in the vectorized loop.
3101   // If we come from a bypass edge then we need to start from the original
3102   // start value.
3103   for (auto &InductionEntry : Legal->getInductionVars()) {
3104     PHINode *OrigPhi = InductionEntry.first;
3105     InductionDescriptor II = InductionEntry.second;
3106
3107     // Create phi nodes to merge from the backedge-taken check block.
3108     PHINode *BCResumeVal =
3109         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3110                         LoopScalarPreHeader->getTerminator());
3111     // Copy original phi DL over to the new one.
3112     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3113     Value *&EndValue = IVEndValues[OrigPhi];
3114     if (OrigPhi == OldInduction) {
3115       // We know what the end value is.
3116 EndValue = VectorTripCount; 3117 } else { 3118 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3119 Type *StepType = II.getStep()->getType(); 3120 Instruction::CastOps CastOp = 3121 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3122 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3123 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3124 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3125 EndValue->setName("ind.end"); 3126 } 3127 3128 // The new PHI merges the original incoming value, in case of a bypass, 3129 // or the value at the end of the vectorized loop. 3130 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3131 3132 // Fix the scalar body counter (PHI node). 3133 // The old induction's phi node in the scalar body needs the truncated 3134 // value. 3135 for (BasicBlock *BB : LoopBypassBlocks) 3136 BCResumeVal->addIncoming(II.getStartValue(), BB); 3137 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3138 } 3139 } 3140 3141 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3142 MDNode *OrigLoopID) { 3143 assert(L && "Expected valid loop."); 3144 3145 // The trip counts should be cached by now. 3146 Value *Count = getOrCreateTripCount(L); 3147 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3148 3149 // We need the OrigLoop (scalar loop part) latch terminator to help 3150 // produce correct debug info for the middle block BB instructions. 3151 // The legality check stage guarantees that the loop will have a single 3152 // latch. 3153 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3154 "Scalar loop latch terminator isn't a branch"); 3155 BranchInst *ScalarLatchBr = 3156 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3157 3158 // Add a check in the middle block to see if we have completed 3159 // all of the iterations in the first vector loop. 3160 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3161 // If tail is to be folded, we know we don't need to run the remainder. 3162 Value *CmpN = Builder.getTrue(); 3163 if (!Cost->foldTailByMasking()) { 3164 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3165 VectorTripCount, "cmp.n", 3166 LoopMiddleBlock->getTerminator()); 3167 3168 // Here we use the same DebugLoc as the scalar loop latch branch instead 3169 // of the corresponding compare because they may have ended up with 3170 // different line numbers and we want to avoid awkward line stepping while 3171 // debugging. Eg. if the compare has got a line number inside the loop. 3172 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3173 } 3174 3175 BranchInst *BrInst = 3176 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3177 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3178 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3179 3180 // Get ready to start creating new instructions into the vectorized body. 3181 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3182 "Inconsistent vector loop preheader"); 3183 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3184 3185 Optional<MDNode *> VectorizedLoopID = 3186 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3187 LLVMLoopVectorizeFollowupVectorized}); 3188 if (VectorizedLoopID.hasValue()) { 3189 L->setLoopID(VectorizedLoopID.getValue()); 3190 3191 // Do not setAlreadyVectorized if loop attributes have been defined 3192 // explicitly. 
3193 return LoopVectorPreHeader; 3194 } 3195 3196 // Keep all loop hints from the original loop on the vector loop (we'll 3197 // replace the vectorizer-specific hints below). 3198 if (MDNode *LID = OrigLoop->getLoopID()) 3199 L->setLoopID(LID); 3200 3201 LoopVectorizeHints Hints(L, true, *ORE); 3202 Hints.setAlreadyVectorized(); 3203 3204 #ifdef EXPENSIVE_CHECKS 3205 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3206 LI->verify(*DT); 3207 #endif 3208 3209 return LoopVectorPreHeader; 3210 } 3211 3212 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3213 /* 3214 In this function we generate a new loop. The new loop will contain 3215 the vectorized instructions while the old loop will continue to run the 3216 scalar remainder. 3217 3218 [ ] <-- loop iteration number check. 3219 / | 3220 / v 3221 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3222 | / | 3223 | / v 3224 || [ ] <-- vector pre header. 3225 |/ | 3226 | v 3227 | [ ] \ 3228 | [ ]_| <-- vector loop. 3229 | | 3230 | v 3231 | -[ ] <--- middle-block. 3232 | / | 3233 | / v 3234 -|- >[ ] <--- new preheader. 3235 | | 3236 | v 3237 | [ ] \ 3238 | [ ]_| <-- old scalar loop to handle remainder. 3239 \ | 3240 \ v 3241 >[ ] <-- exit block. 3242 ... 3243 */ 3244 3245 // Get the metadata of the original loop before it gets modified. 3246 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3247 3248 // Create an empty vector loop, and prepare basic blocks for the runtime 3249 // checks. 3250 Loop *Lp = createVectorLoopSkeleton(""); 3251 3252 // Now, compare the new count to zero. If it is zero skip the vector loop and 3253 // jump to the scalar loop. This check also covers the case where the 3254 // backedge-taken count is uint##_max: adding one to it will overflow leading 3255 // to an incorrect trip count of zero. In this (rare) case we will also jump 3256 // to the scalar loop. 3257 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3258 3259 // Generate the code to check any assumptions that we've made for SCEV 3260 // expressions. 3261 emitSCEVChecks(Lp, LoopScalarPreHeader); 3262 3263 // Generate the code that checks in runtime if arrays overlap. We put the 3264 // checks into a separate block to make the more common case of few elements 3265 // faster. 3266 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3267 3268 // Some loops have a single integer induction variable, while other loops 3269 // don't. One example is c++ iterators that often have multiple pointer 3270 // induction variables. In the code below we also support a case where we 3271 // don't have a single induction variable. 3272 // 3273 // We try to obtain an induction variable from the original loop as hard 3274 // as possible. However if we don't find one that: 3275 // - is an integer 3276 // - counts from zero, stepping by one 3277 // - is the size of the widest induction variable type 3278 // then we create a new one. 3279 OldInduction = Legal->getPrimaryInduction(); 3280 Type *IdxTy = Legal->getWidestInductionType(); 3281 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3282 // The loop step is equal to the vectorization factor (num of SIMD elements) 3283 // times the unroll factor (num of SIMD instructions). 
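  // E.g. (illustrative shorthand, assuming an i64 IV) for VF = 4 and UF = 2 the
  // induction created below advances by 8 per vector iteration:
  //   %index      = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  //   %index.next = add i64 %index, 8
  //   br i1 (icmp eq i64 %index.next, %n.vec), %middle.block, %vector.body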
3284 assert(!VF.Scalable && "scalable vectors not yet supported."); 3285 Constant *Step = ConstantInt::get(IdxTy, VF.Min * UF); 3286 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3287 Induction = 3288 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3289 getDebugLocFromInstOrOperands(OldInduction)); 3290 3291 // Emit phis for the new starting index of the scalar loop. 3292 createInductionResumeValues(Lp, CountRoundDown); 3293 3294 return completeLoopSkeleton(Lp, OrigLoopID); 3295 } 3296 3297 // Fix up external users of the induction variable. At this point, we are 3298 // in LCSSA form, with all external PHIs that use the IV having one input value, 3299 // coming from the remainder loop. We need those PHIs to also have a correct 3300 // value for the IV when arriving directly from the middle block. 3301 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3302 const InductionDescriptor &II, 3303 Value *CountRoundDown, Value *EndValue, 3304 BasicBlock *MiddleBlock) { 3305 // There are two kinds of external IV usages - those that use the value 3306 // computed in the last iteration (the PHI) and those that use the penultimate 3307 // value (the value that feeds into the phi from the loop latch). 3308 // We allow both, but they, obviously, have different values. 3309 3310 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3311 3312 DenseMap<Value *, Value *> MissingVals; 3313 3314 // An external user of the last iteration's value should see the value that 3315 // the remainder loop uses to initialize its own IV. 3316 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3317 for (User *U : PostInc->users()) { 3318 Instruction *UI = cast<Instruction>(U); 3319 if (!OrigLoop->contains(UI)) { 3320 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3321 MissingVals[UI] = EndValue; 3322 } 3323 } 3324 3325 // An external user of the penultimate value need to see EndValue - Step. 3326 // The simplest way to get this is to recompute it from the constituent SCEVs, 3327 // that is Start + (Step * (CRD - 1)). 3328 for (User *U : OrigPhi->users()) { 3329 auto *UI = cast<Instruction>(U); 3330 if (!OrigLoop->contains(UI)) { 3331 const DataLayout &DL = 3332 OrigLoop->getHeader()->getModule()->getDataLayout(); 3333 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3334 3335 IRBuilder<> B(MiddleBlock->getTerminator()); 3336 Value *CountMinusOne = B.CreateSub( 3337 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3338 Value *CMO = 3339 !II.getStep()->getType()->isIntegerTy() 3340 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3341 II.getStep()->getType()) 3342 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3343 CMO->setName("cast.cmo"); 3344 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3345 Escape->setName("ind.escape"); 3346 MissingVals[UI] = Escape; 3347 } 3348 } 3349 3350 for (auto &I : MissingVals) { 3351 PHINode *PHI = cast<PHINode>(I.first); 3352 // One corner case we have to handle is two IVs "chasing" each-other, 3353 // that is %IV2 = phi [...], [ %IV1, %latch ] 3354 // In this case, if IV1 has an external use, we need to avoid adding both 3355 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3356 // don't already have an incoming value for the middle block. 
3357 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3358 PHI->addIncoming(I.second, MiddleBlock); 3359 } 3360 } 3361 3362 namespace { 3363 3364 struct CSEDenseMapInfo { 3365 static bool canHandle(const Instruction *I) { 3366 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3367 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3368 } 3369 3370 static inline Instruction *getEmptyKey() { 3371 return DenseMapInfo<Instruction *>::getEmptyKey(); 3372 } 3373 3374 static inline Instruction *getTombstoneKey() { 3375 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3376 } 3377 3378 static unsigned getHashValue(const Instruction *I) { 3379 assert(canHandle(I) && "Unknown instruction!"); 3380 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3381 I->value_op_end())); 3382 } 3383 3384 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3385 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3386 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3387 return LHS == RHS; 3388 return LHS->isIdenticalTo(RHS); 3389 } 3390 }; 3391 3392 } // end anonymous namespace 3393 3394 ///Perform cse of induction variable instructions. 3395 static void cse(BasicBlock *BB) { 3396 // Perform simple cse. 3397 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3398 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3399 Instruction *In = &*I++; 3400 3401 if (!CSEDenseMapInfo::canHandle(In)) 3402 continue; 3403 3404 // Check if we can replace this instruction with any of the 3405 // visited instructions. 3406 if (Instruction *V = CSEMap.lookup(In)) { 3407 In->replaceAllUsesWith(V); 3408 In->eraseFromParent(); 3409 continue; 3410 } 3411 3412 CSEMap[In] = In; 3413 } 3414 } 3415 3416 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3417 ElementCount VF, 3418 bool &NeedToScalarize) { 3419 assert(!VF.Scalable && "scalable vectors not yet supported."); 3420 Function *F = CI->getCalledFunction(); 3421 Type *ScalarRetTy = CI->getType(); 3422 SmallVector<Type *, 4> Tys, ScalarTys; 3423 for (auto &ArgOp : CI->arg_operands()) 3424 ScalarTys.push_back(ArgOp->getType()); 3425 3426 // Estimate cost of scalarized vector call. The source operands are assumed 3427 // to be vectors, so we need to extract individual elements from there, 3428 // execute VF scalar calls, and then gather the result into the vector return 3429 // value. 3430 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, 3431 TTI::TCK_RecipThroughput); 3432 if (VF.isScalar()) 3433 return ScalarCallCost; 3434 3435 // Compute corresponding vector type for return value and arguments. 3436 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3437 for (Type *ScalarTy : ScalarTys) 3438 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3439 3440 // Compute costs of unpacking argument values for the scalar calls and 3441 // packing the return values to a vector. 3442 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3443 3444 unsigned Cost = ScalarCallCost * VF.Min + ScalarizationCost; 3445 3446 // If we can't emit a vector call for this function, then the currently found 3447 // cost is the cost we need to return. 3448 NeedToScalarize = true; 3449 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3450 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3451 3452 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3453 return Cost; 3454 3455 // If the corresponding vector cost is cheaper, return its cost. 
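  // Illustrative numbers: with VF = 4, a scalar call cost of 10 and a
  // scalarization overhead of 12 give 4 * 10 + 12 = 52; a vector library call
  // costing, say, 20 is then preferred and NeedToScalarize is cleared.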
3456 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, 3457 TTI::TCK_RecipThroughput); 3458 if (VectorCallCost < Cost) { 3459 NeedToScalarize = false; 3460 return VectorCallCost; 3461 } 3462 return Cost; 3463 } 3464 3465 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3466 ElementCount VF) { 3467 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3468 assert(ID && "Expected intrinsic call!"); 3469 3470 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3471 return TTI.getIntrinsicInstrCost(CostAttrs, 3472 TargetTransformInfo::TCK_RecipThroughput); 3473 } 3474 3475 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3476 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3477 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3478 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3479 } 3480 3481 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3482 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3483 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3484 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3485 } 3486 3487 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3488 // For every instruction `I` in MinBWs, truncate the operands, create a 3489 // truncated version of `I` and reextend its result. InstCombine runs 3490 // later and will remove any ext/trunc pairs. 3491 SmallPtrSet<Value *, 4> Erased; 3492 for (const auto &KV : Cost->getMinimalBitwidths()) { 3493 // If the value wasn't vectorized, we must maintain the original scalar 3494 // type. The absence of the value from VectorLoopValueMap indicates that it 3495 // wasn't vectorized. 3496 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3497 continue; 3498 for (unsigned Part = 0; Part < UF; ++Part) { 3499 Value *I = getOrCreateVectorValue(KV.first, Part); 3500 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3501 continue; 3502 Type *OriginalTy = I->getType(); 3503 Type *ScalarTruncatedTy = 3504 IntegerType::get(OriginalTy->getContext(), KV.second); 3505 auto *TruncatedTy = FixedVectorType::get( 3506 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements()); 3507 if (TruncatedTy == OriginalTy) 3508 continue; 3509 3510 IRBuilder<> B(cast<Instruction>(I)); 3511 auto ShrinkOperand = [&](Value *V) -> Value * { 3512 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3513 if (ZI->getSrcTy() == TruncatedTy) 3514 return ZI->getOperand(0); 3515 return B.CreateZExtOrTrunc(V, TruncatedTy); 3516 }; 3517 3518 // The actual instruction modification depends on the instruction type, 3519 // unfortunately. 3520 Value *NewI = nullptr; 3521 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3522 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3523 ShrinkOperand(BO->getOperand(1))); 3524 3525 // Any wrapping introduced by shrinking this operation shouldn't be 3526 // considered undefined behavior. So, we can't unconditionally copy 3527 // arithmetic wrapping flags to NewI. 
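  // E.g. an 'add nsw i32' that provably never wraps in i32 may well wrap once
  // its operands are shrunk to i8, so keeping the nsw flag would wrongly
  // introduce poison (illustrative example).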
3528 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3529 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3530 NewI = 3531 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3532 ShrinkOperand(CI->getOperand(1))); 3533 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3534 NewI = B.CreateSelect(SI->getCondition(), 3535 ShrinkOperand(SI->getTrueValue()), 3536 ShrinkOperand(SI->getFalseValue())); 3537 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3538 switch (CI->getOpcode()) { 3539 default: 3540 llvm_unreachable("Unhandled cast!"); 3541 case Instruction::Trunc: 3542 NewI = ShrinkOperand(CI->getOperand(0)); 3543 break; 3544 case Instruction::SExt: 3545 NewI = B.CreateSExtOrTrunc( 3546 CI->getOperand(0), 3547 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3548 break; 3549 case Instruction::ZExt: 3550 NewI = B.CreateZExtOrTrunc( 3551 CI->getOperand(0), 3552 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3553 break; 3554 } 3555 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3556 auto Elements0 = 3557 cast<VectorType>(SI->getOperand(0)->getType())->getNumElements(); 3558 auto *O0 = B.CreateZExtOrTrunc( 3559 SI->getOperand(0), 3560 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3561 auto Elements1 = 3562 cast<VectorType>(SI->getOperand(1)->getType())->getNumElements(); 3563 auto *O1 = B.CreateZExtOrTrunc( 3564 SI->getOperand(1), 3565 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3566 3567 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3568 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3569 // Don't do anything with the operands, just extend the result. 3570 continue; 3571 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3572 auto Elements = 3573 cast<VectorType>(IE->getOperand(0)->getType())->getNumElements(); 3574 auto *O0 = B.CreateZExtOrTrunc( 3575 IE->getOperand(0), 3576 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3577 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3578 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3579 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3580 auto Elements = 3581 cast<VectorType>(EE->getOperand(0)->getType())->getNumElements(); 3582 auto *O0 = B.CreateZExtOrTrunc( 3583 EE->getOperand(0), 3584 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3585 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3586 } else { 3587 // If we don't know what to do, be conservative and don't do anything. 3588 continue; 3589 } 3590 3591 // Lastly, extend the result. 3592 NewI->takeName(cast<Instruction>(I)); 3593 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3594 I->replaceAllUsesWith(Res); 3595 cast<Instruction>(I)->eraseFromParent(); 3596 Erased.insert(I); 3597 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3598 } 3599 } 3600 3601 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3602 for (const auto &KV : Cost->getMinimalBitwidths()) { 3603 // If the value wasn't vectorized, we must maintain the original scalar 3604 // type. The absence of the value from VectorLoopValueMap indicates that it 3605 // wasn't vectorized. 
3606     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3607       continue;
3608     for (unsigned Part = 0; Part < UF; ++Part) {
3609       Value *I = getOrCreateVectorValue(KV.first, Part);
3610       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3611       if (Inst && Inst->use_empty()) {
3612         Value *NewI = Inst->getOperand(0);
3613         Inst->eraseFromParent();
3614         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3615       }
3616     }
3617   }
3618 }
3619
3620 void InnerLoopVectorizer::fixVectorizedLoop() {
3621   // Insert truncates and extends for any truncated instructions as hints to
3622   // InstCombine.
3623   if (VF.isVector())
3624     truncateToMinimalBitwidths();
3625
3626   // Fix widened non-induction PHIs by setting up the PHI operands.
3627   if (OrigPHIsToFix.size()) {
3628     assert(EnableVPlanNativePath &&
3629            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3630     fixNonInductionPHIs();
3631   }
3632
3633   // At this point every instruction in the original loop is widened to a
3634   // vector form. Now we need to fix the recurrences in the loop. These PHI
3635   // nodes are currently empty because we did not want to introduce cycles.
3636   // This is the second stage of vectorizing recurrences.
3637   fixCrossIterationPHIs();
3638
3639   // Forget the original basic block.
3640   PSE.getSE()->forgetLoop(OrigLoop);
3641
3642   // Fix-up external users of the induction variables.
3643   for (auto &Entry : Legal->getInductionVars())
3644     fixupIVUsers(Entry.first, Entry.second,
3645                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3646                  IVEndValues[Entry.first], LoopMiddleBlock);
3647
3648   fixLCSSAPHIs();
3649   for (Instruction *PI : PredicatedInstructions)
3650     sinkScalarOperands(&*PI);
3651
3652   // Remove redundant induction instructions.
3653   cse(LoopVectorBody);
3654
3655   // Set/update profile weights for the vector and remainder loops, as the
3656   // original loop iterations are now distributed among them. Note that the
3657   // original loop (LoopScalarBody) becomes the remainder loop after vectorization.
3658   //
3659   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3660   // end up with a slightly less accurate result, but that should be OK since
3661   // profile info is not inherently precise anyway. Note also that a possible
3662   // bypass of the vector code caused by legality checks is ignored, optimistically
3663   // assigning all the weight to the vector loop.
3664   assert(!VF.Scalable &&
3665          "cannot use scalable ElementCount to determine unroll factor");
3666   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3667                                LI->getLoopFor(LoopVectorBody),
3668                                LI->getLoopFor(LoopScalarBody), VF.Min * UF);
3669 }
3670
3671 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3672   // In order to support recurrences we need to be able to vectorize Phi nodes.
3673   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3674   // stage #2: We now need to fix the recurrences by adding incoming edges to
3675   // the currently empty PHI nodes. At this point every instruction in the
3676   // original loop is widened to a vector form so we can use them to construct
3677   // the incoming edges.
3678   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3679     // Handle first-order recurrences and reductions that need to be fixed.
3680 if (Legal->isFirstOrderRecurrence(&Phi)) 3681 fixFirstOrderRecurrence(&Phi); 3682 else if (Legal->isReductionVariable(&Phi)) 3683 fixReduction(&Phi); 3684 } 3685 } 3686 3687 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3688 // This is the second phase of vectorizing first-order recurrences. An 3689 // overview of the transformation is described below. Suppose we have the 3690 // following loop. 3691 // 3692 // for (int i = 0; i < n; ++i) 3693 // b[i] = a[i] - a[i - 1]; 3694 // 3695 // There is a first-order recurrence on "a". For this loop, the shorthand 3696 // scalar IR looks like: 3697 // 3698 // scalar.ph: 3699 // s_init = a[-1] 3700 // br scalar.body 3701 // 3702 // scalar.body: 3703 // i = phi [0, scalar.ph], [i+1, scalar.body] 3704 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3705 // s2 = a[i] 3706 // b[i] = s2 - s1 3707 // br cond, scalar.body, ... 3708 // 3709 // In this example, s1 is a recurrence because it's value depends on the 3710 // previous iteration. In the first phase of vectorization, we created a 3711 // temporary value for s1. We now complete the vectorization and produce the 3712 // shorthand vector IR shown below (for VF = 4, UF = 1). 3713 // 3714 // vector.ph: 3715 // v_init = vector(..., ..., ..., a[-1]) 3716 // br vector.body 3717 // 3718 // vector.body 3719 // i = phi [0, vector.ph], [i+4, vector.body] 3720 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3721 // v2 = a[i, i+1, i+2, i+3]; 3722 // v3 = vector(v1(3), v2(0, 1, 2)) 3723 // b[i, i+1, i+2, i+3] = v2 - v3 3724 // br cond, vector.body, middle.block 3725 // 3726 // middle.block: 3727 // x = v2(3) 3728 // br scalar.ph 3729 // 3730 // scalar.ph: 3731 // s_init = phi [x, middle.block], [a[-1], otherwise] 3732 // br scalar.body 3733 // 3734 // After execution completes the vector loop, we extract the next value of 3735 // the recurrence (x) to use as the initial value in the scalar loop. 3736 3737 // Get the original loop preheader and single loop latch. 3738 auto *Preheader = OrigLoop->getLoopPreheader(); 3739 auto *Latch = OrigLoop->getLoopLatch(); 3740 3741 // Get the initial and previous values of the scalar recurrence. 3742 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 3743 auto *Previous = Phi->getIncomingValueForBlock(Latch); 3744 3745 // Create a vector from the initial value. 3746 auto *VectorInit = ScalarInit; 3747 if (VF.isVector()) { 3748 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3749 assert(!VF.Scalable && "VF is assumed to be non scalable."); 3750 VectorInit = Builder.CreateInsertElement( 3751 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3752 Builder.getInt32(VF.Min - 1), "vector.recur.init"); 3753 } 3754 3755 // We constructed a temporary phi node in the first phase of vectorization. 3756 // This phi node will eventually be deleted. 3757 Builder.SetInsertPoint( 3758 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3759 3760 // Create a phi node for the new recurrence. The current value will either be 3761 // the initial value inserted into a vector or loop-varying vector value. 3762 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3763 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3764 3765 // Get the vectorized previous value of the last part UF - 1. It appears last 3766 // among all unrolled iterations, due to the order of their construction. 
3767 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3768 3769 // Find and set the insertion point after the previous value if it is an 3770 // instruction. 3771 BasicBlock::iterator InsertPt; 3772 // Note that the previous value may have been constant-folded so it is not 3773 // guaranteed to be an instruction in the vector loop. 3774 // FIXME: Loop invariant values do not form recurrences. We should deal with 3775 // them earlier. 3776 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3777 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3778 else { 3779 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3780 if (isa<PHINode>(PreviousLastPart)) 3781 // If the previous value is a phi node, we should insert after all the phi 3782 // nodes in the block containing the PHI to avoid breaking basic block 3783 // verification. Note that the basic block may be different to 3784 // LoopVectorBody, in case we predicate the loop. 3785 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3786 else 3787 InsertPt = ++PreviousInst->getIterator(); 3788 } 3789 Builder.SetInsertPoint(&*InsertPt); 3790 3791 // We will construct a vector for the recurrence by combining the values for 3792 // the current and previous iterations. This is the required shuffle mask. 3793 assert(!VF.Scalable); 3794 SmallVector<int, 8> ShuffleMask(VF.Min); 3795 ShuffleMask[0] = VF.Min - 1; 3796 for (unsigned I = 1; I < VF.Min; ++I) 3797 ShuffleMask[I] = I + VF.Min - 1; 3798 3799 // The vector from which to take the initial value for the current iteration 3800 // (actual or unrolled). Initially, this is the vector phi node. 3801 Value *Incoming = VecPhi; 3802 3803 // Shuffle the current and previous vector and update the vector parts. 3804 for (unsigned Part = 0; Part < UF; ++Part) { 3805 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3806 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3807 auto *Shuffle = 3808 VF.isVector() 3809 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 3810 : Incoming; 3811 PhiPart->replaceAllUsesWith(Shuffle); 3812 cast<Instruction>(PhiPart)->eraseFromParent(); 3813 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3814 Incoming = PreviousPart; 3815 } 3816 3817 // Fix the latch value of the new recurrence in the vector loop. 3818 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3819 3820 // Extract the last vector element in the middle block. This will be the 3821 // initial value for the recurrence when jumping to the scalar loop. 3822 auto *ExtractForScalar = Incoming; 3823 if (VF.isVector()) { 3824 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3825 ExtractForScalar = Builder.CreateExtractElement( 3826 ExtractForScalar, Builder.getInt32(VF.Min - 1), "vector.recur.extract"); 3827 } 3828 // Extract the second last element in the middle block if the 3829 // Phi is used outside the loop. We need to extract the phi itself 3830 // and not the last element (the phi update in the current iteration). This 3831 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3832 // when the scalar loop is not run at all. 
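  // E.g. for VF = 4 (illustrative): lane 3 of the last part feeds the scalar
  // loop's recurrence phi, while lane 2 feeds LCSSA phis in the exit block for
  // the case where the vector loop covers all iterations.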
3833   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3834   if (VF.isVector())
3835     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3836         Incoming, Builder.getInt32(VF.Min - 2), "vector.recur.extract.for.phi");
3837   // When the loop is unrolled without vectorizing, initialize
3838   // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
3839   // value of `Incoming`. This is analogous to the vectorized case above:
3840   // extracting the second last element when VF > 1.
3841   else if (UF > 1)
3842     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3843
3844   // Fix the initial value of the original recurrence in the scalar loop.
3845   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3846   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3847   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3848     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3849     Start->addIncoming(Incoming, BB);
3850   }
3851
3852   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3853   Phi->setName("scalar.recur");
3854
3855   // Finally, fix users of the recurrence outside the loop. The users will need
3856   // either the last value of the scalar recurrence or the last value of the
3857   // vector recurrence we extracted in the middle block. Since the loop is in
3858   // LCSSA form, we just need to find all the phi nodes for the original scalar
3859   // recurrence in the exit block, and then add an edge for the middle block.
3860   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3861     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3862       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3863     }
3864   }
3865 }
3866
3867 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3868   Constant *Zero = Builder.getInt32(0);
3869
3870   // Get its reduction variable descriptor.
3871   assert(Legal->isReductionVariable(Phi) &&
3872          "Unable to find the reduction variable");
3873   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3874
3875   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3876   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3877   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3878   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3879       RdxDesc.getMinMaxRecurrenceKind();
3880   setDebugLocFromInst(Builder, ReductionStartValue);
3881   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
3882
3883   // We need to generate a reduction vector from the incoming scalar.
3884   // To do so, we need to generate the 'identity' vector and override
3885   // one of the elements with the incoming scalar reduction. We need
3886   // to do it in the vector-loop preheader.
3887   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3888
3889   // This is the vector-clone of the value that leaves the loop.
3890   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3891
3892   // Find the reduction identity value: zero for addition, OR and XOR; one for
3893   // multiplication; -1 for AND.
3894   Value *Identity;
3895   Value *VectorStart;
3896   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3897       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3898     // MinMax reductions have the start value as their identity.
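    // E.g. (illustrative) for a 4-wide smax reduction starting at scalar %s,
    // both Identity and VectorStart are the splat <%s, %s, %s, %s>.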
3899 if (VF == 1 || IsInLoopReductionPhi) { 3900 VectorStart = Identity = ReductionStartValue; 3901 } else { 3902 VectorStart = Identity = 3903 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); 3904 } 3905 } else { 3906 // Handle other reduction kinds: 3907 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 3908 RK, VecTy->getScalarType()); 3909 if (VF == 1 || IsInLoopReductionPhi) { 3910 Identity = Iden; 3911 // This vector is the Identity vector where the first element is the 3912 // incoming scalar reduction. 3913 VectorStart = ReductionStartValue; 3914 } else { 3915 Identity = ConstantVector::getSplat(VF, Iden); 3916 3917 // This vector is the Identity vector where the first element is the 3918 // incoming scalar reduction. 3919 VectorStart = 3920 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); 3921 } 3922 } 3923 3924 // Wrap flags are in general invalid after vectorization, clear them. 3925 clearReductionWrapFlags(RdxDesc); 3926 3927 // Fix the vector-loop phi. 3928 3929 // Reductions do not have to start at zero. They can start with 3930 // any loop invariant values. 3931 BasicBlock *Latch = OrigLoop->getLoopLatch(); 3932 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 3933 3934 for (unsigned Part = 0; Part < UF; ++Part) { 3935 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 3936 Value *Val = getOrCreateVectorValue(LoopVal, Part); 3937 // Make sure to add the reduction start value only to the 3938 // first unroll part. 3939 Value *StartVal = (Part == 0) ? VectorStart : Identity; 3940 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 3941 cast<PHINode>(VecRdxPhi) 3942 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3943 } 3944 3945 // Before each round, move the insertion point right between 3946 // the PHIs and the values we are going to write. 3947 // This allows us to write both PHINodes and the extractelement 3948 // instructions. 3949 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3950 3951 setDebugLocFromInst(Builder, LoopExitInst); 3952 3953 // If tail is folded by masking, the vector value to leave the loop should be 3954 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3955 // instead of the former. 3956 if (Cost->foldTailByMasking()) { 3957 for (unsigned Part = 0; Part < UF; ++Part) { 3958 Value *VecLoopExitInst = 3959 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3960 Value *Sel = nullptr; 3961 for (User *U : VecLoopExitInst->users()) { 3962 if (isa<SelectInst>(U)) { 3963 assert(!Sel && "Reduction exit feeding two selects"); 3964 Sel = U; 3965 } else 3966 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3967 } 3968 assert(Sel && "Reduction exit feeds no select"); 3969 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 3970 3971 // If the target can create a predicated operator for the reduction at no 3972 // extra cost in the loop (for example a predicated vadd), it can be 3973 // cheaper for the select to remain in the loop than be sunk out of it, 3974 // and so use the select value for the phi instead of the old 3975 // LoopExitValue. 
3976 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 3977 if (PreferPredicatedReductionSelect || 3978 TTI->preferPredicatedReductionSelect( 3979 RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()), 3980 Phi->getType(), TargetTransformInfo::ReductionFlags())) { 3981 auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); 3982 VecRdxPhi->setIncomingValueForBlock( 3983 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 3984 } 3985 } 3986 } 3987 3988 // If the vector reduction can be performed in a smaller type, we truncate 3989 // then extend the loop exit value to enable InstCombine to evaluate the 3990 // entire expression in the smaller type. 3991 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 3992 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 3993 assert(!VF.Scalable && "scalable vectors not yet supported."); 3994 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3995 Builder.SetInsertPoint( 3996 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 3997 VectorParts RdxParts(UF); 3998 for (unsigned Part = 0; Part < UF; ++Part) { 3999 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4000 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4001 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4002 : Builder.CreateZExt(Trunc, VecTy); 4003 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4004 UI != RdxParts[Part]->user_end();) 4005 if (*UI != Trunc) { 4006 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4007 RdxParts[Part] = Extnd; 4008 } else { 4009 ++UI; 4010 } 4011 } 4012 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4013 for (unsigned Part = 0; Part < UF; ++Part) { 4014 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4015 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 4016 } 4017 } 4018 4019 // Reduce all of the unrolled parts into a single vector. 4020 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 4021 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 4022 4023 // The middle block terminator has already been assigned a DebugLoc here (the 4024 // OrigLoop's single latch terminator). We want the whole middle block to 4025 // appear to execute on this line because: (a) it is all compiler generated, 4026 // (b) these instructions are always executed after evaluating the latch 4027 // conditional branch, and (c) other passes may add new predecessors which 4028 // terminate on this line. This is the easiest way to ensure we don't 4029 // accidentally cause an extra step back into the loop while debugging. 4030 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4031 for (unsigned Part = 1; Part < UF; ++Part) { 4032 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4033 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 4034 // Floating point operations had to be 'fast' to enable the reduction. 4035 ReducedPartRdx = addFastMathFlag( 4036 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 4037 ReducedPartRdx, "bin.rdx"), 4038 RdxDesc.getFastMathFlags()); 4039 else 4040 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 4041 RdxPart); 4042 } 4043 4044 // Create the reduction after the loop. Note that inloop reductions create the 4045 // target reduction in the loop using a Reduction recipe. 
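  // Shorthand of the middle-block code for an integer add reduction with
  // UF = 2 and VF = 4 (names illustrative):
  //   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0   ; combine unrolled parts
  //   %rdx     = horizontal add of %bin.rdx's four lanes (createTargetReduction)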
4046 if (VF.isVector() && !IsInLoopReductionPhi) { 4047 bool NoNaN = Legal->hasFunNoNaNAttr(); 4048 ReducedPartRdx = 4049 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 4050 // If the reduction can be performed in a smaller type, we need to extend 4051 // the reduction to the wider type before we branch to the original loop. 4052 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4053 ReducedPartRdx = 4054 RdxDesc.isSigned() 4055 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4056 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4057 } 4058 4059 // Create a phi node that merges control-flow from the backedge-taken check 4060 // block and the middle block. 4061 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4062 LoopScalarPreHeader->getTerminator()); 4063 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4064 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4065 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4066 4067 // Now, we need to fix the users of the reduction variable 4068 // inside and outside of the scalar remainder loop. 4069 // We know that the loop is in LCSSA form. We need to update the 4070 // PHI nodes in the exit blocks. 4071 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4072 // All PHINodes need to have a single entry edge, or two if 4073 // we already fixed them. 4074 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 4075 4076 // We found a reduction value exit-PHI. Update it with the 4077 // incoming bypass edge. 4078 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 4079 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4080 } // end of the LCSSA phi scan. 4081 4082 // Fix the scalar loop reduction variable with the incoming reduction sum 4083 // from the vector body and from the backedge value. 4084 int IncomingEdgeBlockIdx = 4085 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4086 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4087 // Pick the other block. 4088 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 4089 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4090 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4091 } 4092 4093 void InnerLoopVectorizer::clearReductionWrapFlags( 4094 RecurrenceDescriptor &RdxDesc) { 4095 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 4096 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 4097 RK != RecurrenceDescriptor::RK_IntegerMult) 4098 return; 4099 4100 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4101 assert(LoopExitInstr && "null loop exit instruction"); 4102 SmallVector<Instruction *, 8> Worklist; 4103 SmallPtrSet<Instruction *, 8> Visited; 4104 Worklist.push_back(LoopExitInstr); 4105 Visited.insert(LoopExitInstr); 4106 4107 while (!Worklist.empty()) { 4108 Instruction *Cur = Worklist.pop_back_val(); 4109 if (isa<OverflowingBinaryOperator>(Cur)) 4110 for (unsigned Part = 0; Part < UF; ++Part) { 4111 Value *V = getOrCreateVectorValue(Cur, Part); 4112 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4113 } 4114 4115 for (User *U : Cur->users()) { 4116 Instruction *UI = cast<Instruction>(U); 4117 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4118 Visited.insert(UI).second) 4119 Worklist.push_back(UI); 4120 } 4121 } 4122 } 4123 4124 void InnerLoopVectorizer::fixLCSSAPHIs() { 4125 assert(!VF.Scalable && "the code below assumes fixed width vectors"); 4126 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4127 if (LCSSAPhi.getNumIncomingValues() == 1) { 4128 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4129 // Non-instruction incoming values will have only one value. 4130 unsigned LastLane = 0; 4131 if (isa<Instruction>(IncomingValue)) 4132 LastLane = Cost->isUniformAfterVectorization( 4133 cast<Instruction>(IncomingValue), VF) 4134 ? 0 4135 : VF.Min - 1; 4136 // Can be a loop invariant incoming value or the last scalar value to be 4137 // extracted from the vectorized loop. 4138 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4139 Value *lastIncomingValue = 4140 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 4141 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4142 } 4143 } 4144 } 4145 4146 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4147 // The basic block and loop containing the predicated instruction. 4148 auto *PredBB = PredInst->getParent(); 4149 auto *VectorLoop = LI->getLoopFor(PredBB); 4150 4151 // Initialize a worklist with the operands of the predicated instruction. 4152 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4153 4154 // Holds instructions that we need to analyze again. An instruction may be 4155 // reanalyzed if we don't yet know if we can sink it or not. 4156 SmallVector<Instruction *, 8> InstsToReanalyze; 4157 4158 // Returns true if a given use occurs in the predicated block. Phi nodes use 4159 // their operands in their corresponding predecessor blocks. 4160 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4161 auto *I = cast<Instruction>(U.getUser()); 4162 BasicBlock *BB = I->getParent(); 4163 if (auto *Phi = dyn_cast<PHINode>(I)) 4164 BB = Phi->getIncomingBlock( 4165 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4166 return BB == PredBB; 4167 }; 4168 4169 // Iteratively sink the scalarized operands of the predicated instruction 4170 // into the block we created for it. When an instruction is sunk, it's 4171 // operands are then added to the worklist. 
The algorithm ends after one pass 4172 // through the worklist doesn't sink a single instruction. 4173 bool Changed; 4174 do { 4175 // Add the instructions that need to be reanalyzed to the worklist, and 4176 // reset the changed indicator. 4177 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4178 InstsToReanalyze.clear(); 4179 Changed = false; 4180 4181 while (!Worklist.empty()) { 4182 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4183 4184 // We can't sink an instruction if it is a phi node, is already in the 4185 // predicated block, is not in the loop, or may have side effects. 4186 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4187 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4188 continue; 4189 4190 // It's legal to sink the instruction if all its uses occur in the 4191 // predicated block. Otherwise, there's nothing to do yet, and we may 4192 // need to reanalyze the instruction. 4193 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4194 InstsToReanalyze.push_back(I); 4195 continue; 4196 } 4197 4198 // Move the instruction to the beginning of the predicated block, and add 4199 // it's operands to the worklist. 4200 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4201 Worklist.insert(I->op_begin(), I->op_end()); 4202 4203 // The sinking may have enabled other instructions to be sunk, so we will 4204 // need to iterate. 4205 Changed = true; 4206 } 4207 } while (Changed); 4208 } 4209 4210 void InnerLoopVectorizer::fixNonInductionPHIs() { 4211 for (PHINode *OrigPhi : OrigPHIsToFix) { 4212 PHINode *NewPhi = 4213 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4214 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4215 4216 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4217 predecessors(OrigPhi->getParent())); 4218 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4219 predecessors(NewPhi->getParent())); 4220 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4221 "Scalar and Vector BB should have the same number of predecessors"); 4222 4223 // The insertion point in Builder may be invalidated by the time we get 4224 // here. Force the Builder insertion point to something valid so that we do 4225 // not run into issues during insertion point restore in 4226 // getOrCreateVectorValue calls below. 4227 Builder.SetInsertPoint(NewPhi); 4228 4229 // The predecessor order is preserved and we can rely on mapping between 4230 // scalar and vector block predecessors. 4231 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4232 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4233 4234 // When looking up the new scalar/vector values to fix up, use incoming 4235 // values from original phi. 4236 Value *ScIncV = 4237 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4238 4239 // Scalar incoming value may need a broadcast 4240 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4241 NewPhi->addIncoming(NewIncV, NewPredBB); 4242 } 4243 } 4244 } 4245 4246 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, 4247 unsigned UF, ElementCount VF, 4248 bool IsPtrLoopInvariant, 4249 SmallBitVector &IsIndexLoopInvariant, 4250 VPTransformState &State) { 4251 // Construct a vector GEP by widening the operands of the scalar GEP as 4252 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4253 // results in a vector of pointers when at least one operand of the GEP 4254 // is vector-typed. 
Thus, to keep the representation compact, we only use 4255 // vector-typed operands for loop-varying values. 4256 4257 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4258 // If we are vectorizing, but the GEP has only loop-invariant operands, 4259 // the GEP we build (by only using vector-typed operands for 4260 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4261 // produce a vector of pointers, we need to either arbitrarily pick an 4262 // operand to broadcast, or broadcast a clone of the original GEP. 4263 // Here, we broadcast a clone of the original. 4264 // 4265 // TODO: If at some point we decide to scalarize instructions having 4266 // loop-invariant operands, this special case will no longer be 4267 // required. We would add the scalarization decision to 4268 // collectLoopScalars() and teach getVectorValue() to broadcast 4269 // the lane-zero scalar value. 4270 auto *Clone = Builder.Insert(GEP->clone()); 4271 for (unsigned Part = 0; Part < UF; ++Part) { 4272 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4273 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4274 addMetadata(EntryPart, GEP); 4275 } 4276 } else { 4277 // If the GEP has at least one loop-varying operand, we are sure to 4278 // produce a vector of pointers. But if we are only unrolling, we want 4279 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4280 // produce with the code below will be scalar (if VF == 1) or vector 4281 // (otherwise). Note that for the unroll-only case, we still maintain 4282 // values in the vector mapping with initVector, as we do for other 4283 // instructions. 4284 for (unsigned Part = 0; Part < UF; ++Part) { 4285 // The pointer operand of the new GEP. If it's loop-invariant, we 4286 // won't broadcast it. 4287 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) 4288 : State.get(Operands.getOperand(0), Part); 4289 4290 // Collect all the indices for the new GEP. If any index is 4291 // loop-invariant, we won't broadcast it. 4292 SmallVector<Value *, 4> Indices; 4293 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4294 VPValue *Operand = Operands.getOperand(I); 4295 if (IsIndexLoopInvariant[I - 1]) 4296 Indices.push_back(State.get(Operand, {0, 0})); 4297 else 4298 Indices.push_back(State.get(Operand, Part)); 4299 } 4300 4301 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4302 // but it should be a vector, otherwise. 4303 auto *NewGEP = 4304 GEP->isInBounds() 4305 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4306 Indices) 4307 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4308 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4309 "NewGEP is not a pointer vector"); 4310 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4311 addMetadata(NewGEP, GEP); 4312 } 4313 } 4314 } 4315 4316 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4317 ElementCount VF) { 4318 assert(!VF.Scalable && "scalable vectors not yet supported."); 4319 PHINode *P = cast<PHINode>(PN); 4320 if (EnableVPlanNativePath) { 4321 // Currently we enter here in the VPlan-native path for non-induction 4322 // PHIs where all control flow is uniform. We simply widen these PHIs. 4323 // Create a vector phi with no operands - the vector phi operands will be 4324 // set at the end of vector code generation. 4325 Type *VecTy = 4326 (VF.isScalar()) ? 
PN->getType() : VectorType::get(PN->getType(), VF); 4327 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4328 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4329 OrigPHIsToFix.push_back(P); 4330 4331 return; 4332 } 4333 4334 assert(PN->getParent() == OrigLoop->getHeader() && 4335 "Non-header phis should have been handled elsewhere"); 4336 4337 // In order to support recurrences we need to be able to vectorize Phi nodes. 4338 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4339 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4340 // this value when we vectorize all of the instructions that use the PHI. 4341 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4342 for (unsigned Part = 0; Part < UF; ++Part) { 4343 // This is phase one of vectorizing PHIs. 4344 bool ScalarPHI = 4345 (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4346 Type *VecTy = 4347 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); 4348 Value *EntryPart = PHINode::Create( 4349 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4350 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4351 } 4352 return; 4353 } 4354 4355 setDebugLocFromInst(Builder, P); 4356 4357 // This PHINode must be an induction variable. 4358 // Make sure that we know about it. 4359 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4360 4361 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4362 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4363 4364 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4365 // which can be found from the original scalar operations. 4366 switch (II.getKind()) { 4367 case InductionDescriptor::IK_NoInduction: 4368 llvm_unreachable("Unknown induction"); 4369 case InductionDescriptor::IK_IntInduction: 4370 case InductionDescriptor::IK_FpInduction: 4371 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4372 case InductionDescriptor::IK_PtrInduction: { 4373 // Handle the pointer induction variable case. 4374 assert(P->getType()->isPointerTy() && "Unexpected type."); 4375 4376 if (Cost->isScalarAfterVectorization(P, VF)) { 4377 // This is the normalized GEP that starts counting at zero. 4378 Value *PtrInd = 4379 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4380 // Determine the number of scalars we need to generate for each unroll 4381 // iteration. If the instruction is uniform, we only need to generate the 4382 // first lane. Otherwise, we generate all VF values. 4383 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 
1 : VF.Min; 4384 for (unsigned Part = 0; Part < UF; ++Part) { 4385 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4386 Constant *Idx = 4387 ConstantInt::get(PtrInd->getType(), Lane + Part * VF.Min); 4388 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4389 Value *SclrGep = 4390 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4391 SclrGep->setName("next.gep"); 4392 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4393 } 4394 } 4395 return; 4396 } 4397 assert(isa<SCEVConstant>(II.getStep()) && 4398 "Induction step not a SCEV constant!"); 4399 Type *PhiType = II.getStep()->getType(); 4400 4401 // Build a pointer phi 4402 Value *ScalarStartValue = II.getStartValue(); 4403 Type *ScStValueType = ScalarStartValue->getType(); 4404 PHINode *NewPointerPhi = 4405 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4406 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4407 4408 // A pointer induction, performed by using a gep 4409 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4410 Instruction *InductionLoc = LoopLatch->getTerminator(); 4411 const SCEV *ScalarStep = II.getStep(); 4412 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4413 Value *ScalarStepValue = 4414 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4415 Value *InductionGEP = GetElementPtrInst::Create( 4416 ScStValueType->getPointerElementType(), NewPointerPhi, 4417 Builder.CreateMul(ScalarStepValue, 4418 ConstantInt::get(PhiType, VF.Min * UF)), 4419 "ptr.ind", InductionLoc); 4420 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4421 4422 // Create UF many actual address geps that use the pointer 4423 // phi as base and a vectorized version of the step value 4424 // (<step*0, ..., step*N>) as offset. 4425 for (unsigned Part = 0; Part < UF; ++Part) { 4426 SmallVector<Constant *, 8> Indices; 4427 // Create a vector of consecutive numbers from zero to VF. 4428 for (unsigned i = 0; i < VF.Min; ++i) 4429 Indices.push_back(ConstantInt::get(PhiType, i + Part * VF.Min)); 4430 Constant *StartOffset = ConstantVector::get(Indices); 4431 4432 Value *GEP = Builder.CreateGEP( 4433 ScStValueType->getPointerElementType(), NewPointerPhi, 4434 Builder.CreateMul(StartOffset, 4435 Builder.CreateVectorSplat(VF.Min, ScalarStepValue), 4436 "vector.gep")); 4437 VectorLoopValueMap.setVectorValue(P, Part, GEP); 4438 } 4439 } 4440 } 4441 } 4442 4443 /// A helper function for checking whether an integer division-related 4444 /// instruction may divide by zero (in which case it must be predicated if 4445 /// executed conditionally in the scalar code). 4446 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4447 /// Non-zero divisors that are non compile-time constants will not be 4448 /// converted into multiplication, so we will still end up scalarizing 4449 /// the division, but can do so w/o predication. 
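/// For example, a scalarized 'udiv i32 %x, %n' with a loop-variant divisor %n
/// must be predicated, because %n may be zero in an iteration whose scalar
/// code would not have executed the division, whereas 'udiv i32 %x, 7' can
/// safely be executed unconditionally.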
4450 static bool mayDivideByZero(Instruction &I) { 4451 assert((I.getOpcode() == Instruction::UDiv || 4452 I.getOpcode() == Instruction::SDiv || 4453 I.getOpcode() == Instruction::URem || 4454 I.getOpcode() == Instruction::SRem) && 4455 "Unexpected instruction"); 4456 Value *Divisor = I.getOperand(1); 4457 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4458 return !CInt || CInt->isZero(); 4459 } 4460 4461 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, 4462 VPTransformState &State) { 4463 assert(!VF.Scalable && "scalable vectors not yet supported."); 4464 switch (I.getOpcode()) { 4465 case Instruction::Call: 4466 case Instruction::Br: 4467 case Instruction::PHI: 4468 case Instruction::GetElementPtr: 4469 case Instruction::Select: 4470 llvm_unreachable("This instruction is handled by a different recipe."); 4471 case Instruction::UDiv: 4472 case Instruction::SDiv: 4473 case Instruction::SRem: 4474 case Instruction::URem: 4475 case Instruction::Add: 4476 case Instruction::FAdd: 4477 case Instruction::Sub: 4478 case Instruction::FSub: 4479 case Instruction::FNeg: 4480 case Instruction::Mul: 4481 case Instruction::FMul: 4482 case Instruction::FDiv: 4483 case Instruction::FRem: 4484 case Instruction::Shl: 4485 case Instruction::LShr: 4486 case Instruction::AShr: 4487 case Instruction::And: 4488 case Instruction::Or: 4489 case Instruction::Xor: { 4490 // Just widen unops and binops. 4491 setDebugLocFromInst(Builder, &I); 4492 4493 for (unsigned Part = 0; Part < UF; ++Part) { 4494 SmallVector<Value *, 2> Ops; 4495 for (VPValue *VPOp : User.operands()) 4496 Ops.push_back(State.get(VPOp, Part)); 4497 4498 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4499 4500 if (auto *VecOp = dyn_cast<Instruction>(V)) 4501 VecOp->copyIRFlags(&I); 4502 4503 // Use this vector value for all users of the original instruction. 4504 VectorLoopValueMap.setVectorValue(&I, Part, V); 4505 addMetadata(V, &I); 4506 } 4507 4508 break; 4509 } 4510 case Instruction::ICmp: 4511 case Instruction::FCmp: { 4512 // Widen compares. Generate vector compares. 4513 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4514 auto *Cmp = cast<CmpInst>(&I); 4515 setDebugLocFromInst(Builder, Cmp); 4516 for (unsigned Part = 0; Part < UF; ++Part) { 4517 Value *A = State.get(User.getOperand(0), Part); 4518 Value *B = State.get(User.getOperand(1), Part); 4519 Value *C = nullptr; 4520 if (FCmp) { 4521 // Propagate fast math flags. 4522 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4523 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4524 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4525 } else { 4526 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4527 } 4528 VectorLoopValueMap.setVectorValue(&I, Part, C); 4529 addMetadata(C, &I); 4530 } 4531 4532 break; 4533 } 4534 4535 case Instruction::ZExt: 4536 case Instruction::SExt: 4537 case Instruction::FPToUI: 4538 case Instruction::FPToSI: 4539 case Instruction::FPExt: 4540 case Instruction::PtrToInt: 4541 case Instruction::IntToPtr: 4542 case Instruction::SIToFP: 4543 case Instruction::UIToFP: 4544 case Instruction::Trunc: 4545 case Instruction::FPTrunc: 4546 case Instruction::BitCast: { 4547 auto *CI = cast<CastInst>(&I); 4548 setDebugLocFromInst(Builder, CI); 4549 4550 /// Vectorize casts. 4551 assert(!VF.Scalable && "VF is assumed to be non scalable."); 4552 Type *DestTy = 4553 (VF.isScalar()) ? 
CI->getType() : VectorType::get(CI->getType(), VF); 4554 4555 for (unsigned Part = 0; Part < UF; ++Part) { 4556 Value *A = State.get(User.getOperand(0), Part); 4557 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4558 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4559 addMetadata(Cast, &I); 4560 } 4561 break; 4562 } 4563 default: 4564 // This instruction is not vectorized by simple widening. 4565 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4566 llvm_unreachable("Unhandled instruction!"); 4567 } // end of switch. 4568 } 4569 4570 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, 4571 VPTransformState &State) { 4572 assert(!isa<DbgInfoIntrinsic>(I) && 4573 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4574 setDebugLocFromInst(Builder, &I); 4575 4576 Module *M = I.getParent()->getParent()->getParent(); 4577 auto *CI = cast<CallInst>(&I); 4578 4579 SmallVector<Type *, 4> Tys; 4580 for (Value *ArgOperand : CI->arg_operands()) 4581 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.Min)); 4582 4583 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4584 4585 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4586 // version of the instruction. 4587 // Is it beneficial to perform intrinsic call compared to lib call? 4588 bool NeedToScalarize = false; 4589 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4590 bool UseVectorIntrinsic = 4591 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4592 assert((UseVectorIntrinsic || !NeedToScalarize) && 4593 "Instruction should be scalarized elsewhere."); 4594 4595 for (unsigned Part = 0; Part < UF; ++Part) { 4596 SmallVector<Value *, 4> Args; 4597 for (auto &I : enumerate(ArgOperands.operands())) { 4598 // Some intrinsics have a scalar argument - don't replace it with a 4599 // vector. 4600 Value *Arg; 4601 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4602 Arg = State.get(I.value(), Part); 4603 else 4604 Arg = State.get(I.value(), {0, 0}); 4605 Args.push_back(Arg); 4606 } 4607 4608 Function *VectorF; 4609 if (UseVectorIntrinsic) { 4610 // Use vector version of the intrinsic. 4611 Type *TysForDecl[] = {CI->getType()}; 4612 if (VF.isVector()) { 4613 assert(!VF.Scalable && "VF is assumed to be non scalable."); 4614 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4615 } 4616 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4617 assert(VectorF && "Can't retrieve vector intrinsic."); 4618 } else { 4619 // Use vector version of the function call. 4620 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4621 #ifndef NDEBUG 4622 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4623 "Can't create vector function."); 4624 #endif 4625 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4626 } 4627 SmallVector<OperandBundleDef, 1> OpBundles; 4628 CI->getOperandBundlesAsDefs(OpBundles); 4629 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4630 4631 if (isa<FPMathOperator>(V)) 4632 V->copyFastMathFlags(CI); 4633 4634 VectorLoopValueMap.setVectorValue(&I, Part, V); 4635 addMetadata(V, &I); 4636 } 4637 } 4638 4639 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, 4640 VPUser &Operands, 4641 bool InvariantCond, 4642 VPTransformState &State) { 4643 setDebugLocFromInst(Builder, &I); 4644 4645 // The condition can be loop invariant but still defined inside the 4646 // loop. 
This means that we can't just use the original 'cond' value. 4647 // We have to take the 'vectorized' value and pick the first lane. 4648 // Instcombine will make this a no-op. 4649 auto *InvarCond = 4650 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4651 4652 for (unsigned Part = 0; Part < UF; ++Part) { 4653 Value *Cond = 4654 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4655 Value *Op0 = State.get(Operands.getOperand(1), Part); 4656 Value *Op1 = State.get(Operands.getOperand(2), Part); 4657 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4658 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4659 addMetadata(Sel, &I); 4660 } 4661 } 4662 4663 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4664 // We should not collect Scalars more than once per VF. Right now, this 4665 // function is called from collectUniformsAndScalars(), which already does 4666 // this check. Collecting Scalars for VF=1 does not make any sense. 4667 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4668 "This function should not be visited twice for the same VF"); 4669 4670 SmallSetVector<Instruction *, 8> Worklist; 4671 4672 // These sets are used to seed the analysis with pointers used by memory 4673 // accesses that will remain scalar. 4674 SmallSetVector<Instruction *, 8> ScalarPtrs; 4675 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4676 auto *Latch = TheLoop->getLoopLatch(); 4677 4678 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4679 // The pointer operands of loads and stores will be scalar as long as the 4680 // memory access is not a gather or scatter operation. The value operand of a 4681 // store will remain scalar if the store is scalarized. 4682 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4683 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4684 assert(WideningDecision != CM_Unknown && 4685 "Widening decision should be ready at this moment"); 4686 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4687 if (Ptr == Store->getValueOperand()) 4688 return WideningDecision == CM_Scalarize; 4689 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4690 "Ptr is neither a value or pointer operand"); 4691 return WideningDecision != CM_GatherScatter; 4692 }; 4693 4694 // A helper that returns true if the given value is a bitcast or 4695 // getelementptr instruction contained in the loop. 4696 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4697 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4698 isa<GetElementPtrInst>(V)) && 4699 !TheLoop->isLoopInvariant(V); 4700 }; 4701 4702 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 4703 if (!isa<PHINode>(Ptr) || 4704 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 4705 return false; 4706 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 4707 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 4708 return false; 4709 return isScalarUse(MemAccess, Ptr); 4710 }; 4711 4712 // A helper that evaluates a memory access's use of a pointer. If the 4713 // pointer is actually the pointer induction of a loop, it is being 4714 // inserted into Worklist. If the use will be a scalar use, and the 4715 // pointer is only used by memory accesses, we place the pointer in 4716 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 
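  // For example (illustrative): for 'store i32 %v, i32* %gep', if the store is
  // not a scatter and every user of the loop-varying %gep is a load or store,
  // %gep is placed in ScalarPtrs; otherwise it goes to PossibleNonScalarPtrs.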
4717 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4718 if (isScalarPtrInduction(MemAccess, Ptr)) { 4719 Worklist.insert(cast<Instruction>(Ptr)); 4720 Instruction *Update = cast<Instruction>( 4721 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 4722 Worklist.insert(Update); 4723 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 4724 << "\n"); 4725 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 4726 << "\n"); 4727 return; 4728 } 4729 // We only care about bitcast and getelementptr instructions contained in 4730 // the loop. 4731 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4732 return; 4733 4734 // If the pointer has already been identified as scalar (e.g., if it was 4735 // also identified as uniform), there's nothing to do. 4736 auto *I = cast<Instruction>(Ptr); 4737 if (Worklist.count(I)) 4738 return; 4739 4740 // If the use of the pointer will be a scalar use, and all users of the 4741 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4742 // place the pointer in PossibleNonScalarPtrs. 4743 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4744 return isa<LoadInst>(U) || isa<StoreInst>(U); 4745 })) 4746 ScalarPtrs.insert(I); 4747 else 4748 PossibleNonScalarPtrs.insert(I); 4749 }; 4750 4751 // We seed the scalars analysis with two classes of instructions: (1) 4752 // instructions marked uniform-after-vectorization and (2) bitcast, 4753 // getelementptr and (pointer) phi instructions used by memory accesses 4754 // requiring a scalar use. 4755 // 4756 // (1) Add to the worklist all instructions that have been identified as 4757 // uniform-after-vectorization. 4758 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4759 4760 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4761 // memory accesses requiring a scalar use. The pointer operands of loads and 4762 // stores will be scalar as long as the memory access is not a gather or 4763 // scatter operation. The value operand of a store will remain scalar if the 4764 // store is scalarized. 4765 for (auto *BB : TheLoop->blocks()) 4766 for (auto &I : *BB) { 4767 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4768 evaluatePtrUse(Load, Load->getPointerOperand()); 4769 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4770 evaluatePtrUse(Store, Store->getPointerOperand()); 4771 evaluatePtrUse(Store, Store->getValueOperand()); 4772 } 4773 } 4774 for (auto *I : ScalarPtrs) 4775 if (!PossibleNonScalarPtrs.count(I)) { 4776 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4777 Worklist.insert(I); 4778 } 4779 4780 // Insert the forced scalars. 4781 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4782 // induction variable when the PHI user is scalarized. 4783 auto ForcedScalar = ForcedScalars.find(VF); 4784 if (ForcedScalar != ForcedScalars.end()) 4785 for (auto *I : ForcedScalar->second) 4786 Worklist.insert(I); 4787 4788 // Expand the worklist by looking through any bitcasts and getelementptr 4789 // instructions we've already identified as scalar. This is similar to the 4790 // expansion step in collectLoopUniforms(); however, here we're only 4791 // expanding to include additional bitcasts and getelementptr instructions.
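  // For example (illustrative): if a scalar bitcast in the worklist takes its
  // operand from
  //   %gep = getelementptr inbounds i32, i32* %base, i64 %iv
  // and every user of %gep is either outside the loop, already in the worklist,
  // or a load/store that uses %gep as a scalar (non-gather/scatter) address,
  // then %gep is added to the worklist as well.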
4792 unsigned Idx = 0; 4793 while (Idx != Worklist.size()) { 4794 Instruction *Dst = Worklist[Idx++]; 4795 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4796 continue; 4797 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4798 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4799 auto *J = cast<Instruction>(U); 4800 return !TheLoop->contains(J) || Worklist.count(J) || 4801 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4802 isScalarUse(J, Src)); 4803 })) { 4804 Worklist.insert(Src); 4805 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4806 } 4807 } 4808 4809 // An induction variable will remain scalar if all users of the induction 4810 // variable and induction variable update remain scalar. 4811 for (auto &Induction : Legal->getInductionVars()) { 4812 auto *Ind = Induction.first; 4813 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4814 4815 // If tail-folding is applied, the primary induction variable will be used 4816 // to feed a vector compare. 4817 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4818 continue; 4819 4820 // Determine if all users of the induction variable are scalar after 4821 // vectorization. 4822 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4823 auto *I = cast<Instruction>(U); 4824 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4825 }); 4826 if (!ScalarInd) 4827 continue; 4828 4829 // Determine if all users of the induction variable update instruction are 4830 // scalar after vectorization. 4831 auto ScalarIndUpdate = 4832 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4833 auto *I = cast<Instruction>(U); 4834 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4835 }); 4836 if (!ScalarIndUpdate) 4837 continue; 4838 4839 // The induction variable and its update instruction will remain scalar. 4840 Worklist.insert(Ind); 4841 Worklist.insert(IndUpdate); 4842 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4843 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4844 << "\n"); 4845 } 4846 4847 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4848 } 4849 4850 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 4851 ElementCount VF) { 4852 assert(!VF.Scalable && "scalable vectors not yet supported."); 4853 if (!blockNeedsPredication(I->getParent())) 4854 return false; 4855 switch(I->getOpcode()) { 4856 default: 4857 break; 4858 case Instruction::Load: 4859 case Instruction::Store: { 4860 if (!Legal->isMaskRequired(I)) 4861 return false; 4862 auto *Ptr = getLoadStorePointerOperand(I); 4863 auto *Ty = getMemInstValueType(I); 4864 // We have already decided how to vectorize this instruction, get that 4865 // result. 4866 if (VF.isVector()) { 4867 InstWidening WideningDecision = getWideningDecision(I, VF); 4868 assert(WideningDecision != CM_Unknown && 4869 "Widening decision should be ready at this moment"); 4870 return WideningDecision == CM_Scalarize; 4871 } 4872 const Align Alignment = getLoadStoreAlignment(I); 4873 return isa<LoadInst>(I) ? 
!(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4874 isLegalMaskedGather(Ty, Alignment)) 4875 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4876 isLegalMaskedScatter(Ty, Alignment)); 4877 } 4878 case Instruction::UDiv: 4879 case Instruction::SDiv: 4880 case Instruction::SRem: 4881 case Instruction::URem: 4882 return mayDivideByZero(*I); 4883 } 4884 return false; 4885 } 4886 4887 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4888 Instruction *I, ElementCount VF) { 4889 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4890 assert(getWideningDecision(I, VF) == CM_Unknown && 4891 "Decision should not be set yet."); 4892 auto *Group = getInterleavedAccessGroup(I); 4893 assert(Group && "Must have a group."); 4894 4895 // If the instruction's allocated size doesn't equal its type size, it 4896 // requires padding and will be scalarized. 4897 auto &DL = I->getModule()->getDataLayout(); 4898 auto *ScalarTy = getMemInstValueType(I); 4899 if (hasIrregularType(ScalarTy, DL, VF)) 4900 return false; 4901 4902 // Check if masking is required. 4903 // A Group may need masking for one of two reasons: it resides in a block that 4904 // needs predication, or it was decided to use masking to deal with gaps. 4905 bool PredicatedAccessRequiresMasking = 4906 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 4907 bool AccessWithGapsRequiresMasking = 4908 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 4909 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 4910 return true; 4911 4912 // If masked interleaving is required, we expect that the user/target had 4913 // enabled it, because otherwise it either wouldn't have been created or 4914 // it should have been invalidated by the CostModel. 4915 assert(useMaskedInterleavedAccesses(TTI) && 4916 "Masked interleave-groups for predicated accesses are not enabled."); 4917 4918 auto *Ty = getMemInstValueType(I); 4919 const Align Alignment = getLoadStoreAlignment(I); 4920 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4921 : TTI.isLegalMaskedStore(Ty, Alignment); 4922 } 4923 4924 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4925 Instruction *I, ElementCount VF) { 4926 // Get and ensure we have a valid memory instruction. 4927 LoadInst *LI = dyn_cast<LoadInst>(I); 4928 StoreInst *SI = dyn_cast<StoreInst>(I); 4929 assert((LI || SI) && "Invalid memory instruction"); 4930 4931 auto *Ptr = getLoadStorePointerOperand(I); 4932 4933 // In order to be widened, the pointer should be consecutive, first of all. 4934 if (!Legal->isConsecutivePtr(Ptr)) 4935 return false; 4936 4937 // If the instruction is a store located in a predicated block, it will be 4938 // scalarized. 4939 if (isScalarWithPredication(I)) 4940 return false; 4941 4942 // If the instruction's allocated size doesn't equal its type size, it 4943 // requires padding and will be scalarized. 4944 auto &DL = I->getModule()->getDataLayout(); 4945 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 4946 if (hasIrregularType(ScalarTy, DL, VF)) 4947 return false; 4948 4949 return true; 4950 } 4951 4952 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4953 // We should not collect Uniforms more than once per VF. Right now, 4954 // this function is called from collectUniformsAndScalars(), which 4955 // already does this check. Collecting Uniforms for VF=1 does not make any 4956 // sense.
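  // An instruction is uniform-after-vectorization if all VF lanes would compute
  // the same value, so a single scalar copy per unroll part suffices. A typical
  // example is the pointer operand of a consecutive load or store, which is
  // only needed for lane zero of the widened access.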
4957 4958 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 4959 "This function should not be visited twice for the same VF"); 4960 4961 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4962 // not analyze again. Uniforms.count(VF) will return 1. 4963 Uniforms[VF].clear(); 4964 4965 // We now know that the loop is vectorizable! 4966 // Collect instructions inside the loop that will remain uniform after 4967 // vectorization. 4968 4969 // Global values, params and instructions outside of current loop are out of 4970 // scope. 4971 auto isOutOfScope = [&](Value *V) -> bool { 4972 Instruction *I = dyn_cast<Instruction>(V); 4973 return (!I || !TheLoop->contains(I)); 4974 }; 4975 4976 SetVector<Instruction *> Worklist; 4977 BasicBlock *Latch = TheLoop->getLoopLatch(); 4978 4979 // Instructions that are scalar with predication must not be considered 4980 // uniform after vectorization, because that would create an erroneous 4981 // replicating region where only a single instance out of VF should be formed. 4982 // TODO: optimize such seldom cases if found important, see PR40816. 4983 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4984 if (isScalarWithPredication(I, VF)) { 4985 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4986 << *I << "\n"); 4987 return; 4988 } 4989 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4990 Worklist.insert(I); 4991 }; 4992 4993 // Start with the conditional branch. If the branch condition is an 4994 // instruction contained in the loop that is only used by the branch, it is 4995 // uniform. 4996 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4997 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4998 addToWorklistIfAllowed(Cmp); 4999 5000 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 5001 // are pointers that are treated like consecutive pointers during 5002 // vectorization. The pointer operands of interleaved accesses are an 5003 // example. 5004 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 5005 5006 // Holds pointer operands of instructions that are possibly non-uniform. 5007 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 5008 5009 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5010 InstWidening WideningDecision = getWideningDecision(I, VF); 5011 assert(WideningDecision != CM_Unknown && 5012 "Widening decision should be ready at this moment"); 5013 5014 return (WideningDecision == CM_Widen || 5015 WideningDecision == CM_Widen_Reverse || 5016 WideningDecision == CM_Interleave); 5017 }; 5018 // Iterate over the instructions in the loop, and collect all 5019 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 5020 // that a consecutive-like pointer operand will be scalarized, we collect it 5021 // in PossibleNonUniformPtrs instead. We use two sets here because a single 5022 // getelementptr instruction can be used by both vectorized and scalarized 5023 // memory instructions. For example, if a loop loads and stores from the same 5024 // location, but the store is conditional, the store will be scalarized, and 5025 // the getelementptr won't remain uniform. 5026 for (auto *BB : TheLoop->blocks()) 5027 for (auto &I : *BB) { 5028 // If there's no pointer operand, there's nothing to do. 
5029 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 5030 if (!Ptr) 5031 continue; 5032 5033 // True if all users of Ptr are memory accesses that have Ptr as their 5034 // pointer operand. 5035 auto UsersAreMemAccesses = 5036 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 5037 return getLoadStorePointerOperand(U) == Ptr; 5038 }); 5039 5040 // Ensure the memory instruction will not be scalarized or used by 5041 // gather/scatter, making its pointer operand non-uniform. If the pointer 5042 // operand is used by any instruction other than a memory access, we 5043 // conservatively assume the pointer operand may be non-uniform. 5044 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 5045 PossibleNonUniformPtrs.insert(Ptr); 5046 5047 // If the memory instruction will be vectorized and its pointer operand 5048 // is consecutive-like, or interleaving - the pointer operand should 5049 // remain uniform. 5050 else 5051 ConsecutiveLikePtrs.insert(Ptr); 5052 } 5053 5054 // Add to the Worklist all consecutive and consecutive-like pointers that 5055 // aren't also identified as possibly non-uniform. 5056 for (auto *V : ConsecutiveLikePtrs) 5057 if (!PossibleNonUniformPtrs.count(V)) 5058 addToWorklistIfAllowed(V); 5059 5060 // Expand Worklist in topological order: whenever a new instruction 5061 // is added , its users should be already inside Worklist. It ensures 5062 // a uniform instruction will only be used by uniform instructions. 5063 unsigned idx = 0; 5064 while (idx != Worklist.size()) { 5065 Instruction *I = Worklist[idx++]; 5066 5067 for (auto OV : I->operand_values()) { 5068 // isOutOfScope operands cannot be uniform instructions. 5069 if (isOutOfScope(OV)) 5070 continue; 5071 // First order recurrence Phi's should typically be considered 5072 // non-uniform. 5073 auto *OP = dyn_cast<PHINode>(OV); 5074 if (OP && Legal->isFirstOrderRecurrence(OP)) 5075 continue; 5076 // If all the users of the operand are uniform, then add the 5077 // operand into the uniform worklist. 5078 auto *OI = cast<Instruction>(OV); 5079 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5080 auto *J = cast<Instruction>(U); 5081 return Worklist.count(J) || 5082 (OI == getLoadStorePointerOperand(J) && 5083 isUniformDecision(J, VF)); 5084 })) 5085 addToWorklistIfAllowed(OI); 5086 } 5087 } 5088 5089 // Returns true if Ptr is the pointer operand of a memory access instruction 5090 // I, and I is known to not require scalarization. 5091 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5092 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5093 }; 5094 5095 // For an instruction to be added into Worklist above, all its users inside 5096 // the loop should also be in Worklist. However, this condition cannot be 5097 // true for phi nodes that form a cyclic dependence. We must process phi 5098 // nodes separately. An induction variable will remain uniform if all users 5099 // of the induction variable and induction variable update remain uniform. 5100 // The code below handles both pointer and non-pointer induction variables. 5101 for (auto &Induction : Legal->getInductionVars()) { 5102 auto *Ind = Induction.first; 5103 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5104 5105 // Determine if all users of the induction variable are uniform after 5106 // vectorization. 
5107 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5108 auto *I = cast<Instruction>(U); 5109 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5110 isVectorizedMemAccessUse(I, Ind); 5111 }); 5112 if (!UniformInd) 5113 continue; 5114 5115 // Determine if all users of the induction variable update instruction are 5116 // uniform after vectorization. 5117 auto UniformIndUpdate = 5118 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5119 auto *I = cast<Instruction>(U); 5120 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5121 isVectorizedMemAccessUse(I, IndUpdate); 5122 }); 5123 if (!UniformIndUpdate) 5124 continue; 5125 5126 // The induction variable and its update instruction will remain uniform. 5127 addToWorklistIfAllowed(Ind); 5128 addToWorklistIfAllowed(IndUpdate); 5129 } 5130 5131 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5132 } 5133 5134 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5135 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5136 5137 if (Legal->getRuntimePointerChecking()->Need) { 5138 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5139 "runtime pointer checks needed. Enable vectorization of this " 5140 "loop with '#pragma clang loop vectorize(enable)' when " 5141 "compiling with -Os/-Oz", 5142 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5143 return true; 5144 } 5145 5146 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5147 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5148 "runtime SCEV checks needed. Enable vectorization of this " 5149 "loop with '#pragma clang loop vectorize(enable)' when " 5150 "compiling with -Os/-Oz", 5151 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5152 return true; 5153 } 5154 5155 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5156 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5157 reportVectorizationFailure("Runtime stride check for small trip count", 5158 "runtime stride == 1 checks needed. Enable vectorization of " 5159 "this loop without such check by compiling with -Os/-Oz", 5160 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5161 return true; 5162 } 5163 5164 return false; 5165 } 5166 5167 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, 5168 unsigned UserIC) { 5169 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5170 // TODO: It may by useful to do since it's still likely to be dynamically 5171 // uniform if the target can skip. 5172 reportVectorizationFailure( 5173 "Not inserting runtime ptr check for divergent target", 5174 "runtime pointer checks needed. Not enabled for divergent target", 5175 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5176 return None; 5177 } 5178 5179 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5180 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5181 if (TC == 1) { 5182 reportVectorizationFailure("Single iteration (non) loop", 5183 "loop trip count is one, irrelevant for vectorization", 5184 "SingleIterationLoop", ORE, TheLoop); 5185 return None; 5186 } 5187 5188 switch (ScalarEpilogueStatus) { 5189 case CM_ScalarEpilogueAllowed: 5190 return UserVF ? 
UserVF : computeFeasibleMaxVF(TC); 5191 case CM_ScalarEpilogueNotNeededUsePredicate: 5192 LLVM_DEBUG( 5193 dbgs() << "LV: vector predicate hint/switch found.\n" 5194 << "LV: Not allowing scalar epilogue, creating predicated " 5195 << "vector loop.\n"); 5196 break; 5197 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5198 // fallthrough as a special case of OptForSize 5199 case CM_ScalarEpilogueNotAllowedOptSize: 5200 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5201 LLVM_DEBUG( 5202 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5203 else 5204 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5205 << "count.\n"); 5206 5207 // Bail if runtime checks are required, which are not good when optimising 5208 // for size. 5209 if (runtimeChecksRequired()) 5210 return None; 5211 break; 5212 } 5213 5214 // Now try the tail folding 5215 5216 // Invalidate interleave groups that require an epilogue if we can't mask 5217 // the interleave-group. 5218 if (!useMaskedInterleavedAccesses(TTI)) { 5219 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5220 "No decisions should have been taken at this point"); 5221 // Note: There is no need to invalidate any cost modeling decisions here, as 5222 // none were taken so far. 5223 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5224 } 5225 5226 unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC); 5227 assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2"); 5228 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF; 5229 if (TC > 0 && TC % MaxVFtimesIC == 0) { 5230 // Accept MaxVF if we do not have a tail. 5231 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5232 return MaxVF; 5233 } 5234 5235 // If we don't know the precise trip count, or if the trip count that we 5236 // found modulo the vectorization factor is not zero, try to fold the tail 5237 // by masking. 5238 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5239 if (Legal->prepareToFoldTailByMasking()) { 5240 FoldTailByMasking = true; 5241 return MaxVF; 5242 } 5243 5244 if (TC == 0) { 5245 reportVectorizationFailure( 5246 "Unable to calculate the loop count due to complex control flow", 5247 "unable to calculate the loop count due to complex control flow", 5248 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5249 return None; 5250 } 5251 5252 reportVectorizationFailure( 5253 "Cannot optimize for size and vectorize at the same time.", 5254 "cannot optimize for size and vectorize at the same time. " 5255 "Enable vectorization of this loop with '#pragma clang loop " 5256 "vectorize(enable)' when compiling with -Os/-Oz", 5257 "NoTailLoopWithOptForSize", ORE, TheLoop); 5258 return None; 5259 } 5260 5261 unsigned 5262 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 5263 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5264 unsigned SmallestType, WidestType; 5265 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5266 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5267 5268 // Get the maximum safe dependence distance in bits computed by LAA. 5269 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5270 // the memory access that is most restrictive (involved in the smallest 5271 // dependence distance).
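  // For example (illustrative): with a maximum safe dependence distance of
  // 16 bytes and a widest access type of i32, MaxSafeRegisterWidth is 128 bits,
  // so even on a 256-bit target WidestRegister is clamped to 128 and
  // MaxVectorSize below becomes 128 / 32 = 4 lanes.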
5272 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 5273 5274 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 5275 5276 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5277 // Note that both WidestRegister and WidestType may not be powers of 2. 5278 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); 5279 5280 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5281 << " / " << WidestType << " bits.\n"); 5282 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5283 << WidestRegister << " bits.\n"); 5284 5285 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 5286 " into one vector!"); 5287 if (MaxVectorSize == 0) { 5288 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5289 MaxVectorSize = 1; 5290 return MaxVectorSize; 5291 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5292 isPowerOf2_32(ConstTripCount)) { 5293 // We need to clamp the VF to be the ConstTripCount. There is no point in 5294 // choosing a higher viable VF as done in the loop below. 5295 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5296 << ConstTripCount << "\n"); 5297 MaxVectorSize = ConstTripCount; 5298 return MaxVectorSize; 5299 } 5300 5301 unsigned MaxVF = MaxVectorSize; 5302 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5303 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5304 // Collect all viable vectorization factors larger than the default MaxVF 5305 // (i.e. MaxVectorSize). 5306 SmallVector<ElementCount, 8> VFs; 5307 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5308 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5309 VFs.push_back(ElementCount::getFixed(VS)); 5310 5311 // For each VF calculate its register usage. 5312 auto RUs = calculateRegisterUsage(VFs); 5313 5314 // Select the largest VF which doesn't require more registers than existing 5315 // ones. 5316 for (int i = RUs.size() - 1; i >= 0; --i) { 5317 bool Selected = true; 5318 for (auto& pair : RUs[i].MaxLocalUsers) { 5319 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5320 if (pair.second > TargetNumRegisters) 5321 Selected = false; 5322 } 5323 if (Selected) { 5324 MaxVF = VFs[i].Min; 5325 break; 5326 } 5327 } 5328 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5329 if (MaxVF < MinVF) { 5330 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5331 << ") with target's minimum: " << MinVF << '\n'); 5332 MaxVF = MinVF; 5333 } 5334 } 5335 } 5336 return MaxVF; 5337 } 5338 5339 VectorizationFactor 5340 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5341 float Cost = expectedCost(ElementCount::getFixed(1)).first; 5342 const float ScalarCost = Cost; 5343 unsigned Width = 1; 5344 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5345 5346 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5347 if (ForceVectorization && MaxVF > 1) { 5348 // Ignore scalar width, because the user explicitly wants vectorization. 5349 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5350 // evaluation. 5351 Cost = std::numeric_limits<float>::max(); 5352 } 5353 5354 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5355 // Notice that the vector loop needs to be executed fewer times, so 5356 // we need to divide the cost of the vector loop by the width of 5357 // the vector elements.
5358 VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); 5359 float VectorCost = C.first / (float)i; 5360 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5361 << " costs: " << (int)VectorCost << ".\n"); 5362 if (!C.second && !ForceVectorization) { 5363 LLVM_DEBUG( 5364 dbgs() << "LV: Not considering vector loop of width " << i 5365 << " because it will not generate any vector instructions.\n"); 5366 continue; 5367 } 5368 if (VectorCost < Cost) { 5369 Cost = VectorCost; 5370 Width = i; 5371 } 5372 } 5373 5374 if (!EnableCondStoresVectorization && NumPredStores) { 5375 reportVectorizationFailure("There are conditional stores.", 5376 "store that is conditionally executed prevents vectorization", 5377 "ConditionalStore", ORE, TheLoop); 5378 Width = 1; 5379 Cost = ScalarCost; 5380 } 5381 5382 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5383 << "LV: Vectorization seems to be not beneficial, " 5384 << "but was forced by a user.\n"); 5385 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5386 VectorizationFactor Factor = {ElementCount::getFixed(Width), 5387 (unsigned)(Width * Cost)}; 5388 return Factor; 5389 } 5390 5391 std::pair<unsigned, unsigned> 5392 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5393 unsigned MinWidth = -1U; 5394 unsigned MaxWidth = 8; 5395 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5396 5397 // For each block. 5398 for (BasicBlock *BB : TheLoop->blocks()) { 5399 // For each instruction in the loop. 5400 for (Instruction &I : BB->instructionsWithoutDebug()) { 5401 Type *T = I.getType(); 5402 5403 // Skip ignored values. 5404 if (ValuesToIgnore.count(&I)) 5405 continue; 5406 5407 // Only examine Loads, Stores and PHINodes. 5408 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5409 continue; 5410 5411 // Examine PHI nodes that are reduction variables. Update the type to 5412 // account for the recurrence type. 5413 if (auto *PN = dyn_cast<PHINode>(&I)) { 5414 if (!Legal->isReductionVariable(PN)) 5415 continue; 5416 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5417 T = RdxDesc.getRecurrenceType(); 5418 } 5419 5420 // Examine the stored values. 5421 if (auto *ST = dyn_cast<StoreInst>(&I)) 5422 T = ST->getValueOperand()->getType(); 5423 5424 // Ignore loaded pointer types and stored pointer types that are not 5425 // vectorizable. 5426 // 5427 // FIXME: The check here attempts to predict whether a load or store will 5428 // be vectorized. We only know this for certain after a VF has 5429 // been selected. Here, we assume that if an access can be 5430 // vectorized, it will be. We should also look at extending this 5431 // optimization to non-pointer types. 5432 // 5433 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5434 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5435 continue; 5436 5437 MinWidth = std::min(MinWidth, 5438 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5439 MaxWidth = std::max(MaxWidth, 5440 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5441 } 5442 } 5443 5444 return {MinWidth, MaxWidth}; 5445 } 5446 5447 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5448 unsigned LoopCost) { 5449 // -- The interleave heuristics -- 5450 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5451 // There are many micro-architectural considerations that we can't predict 5452 // at this level. 
For example, frontend pressure (on decode or fetch) due to 5453 // code size, or the number and capabilities of the execution ports. 5454 // 5455 // We use the following heuristics to select the interleave count: 5456 // 1. If the code has reductions, then we interleave to break the cross 5457 // iteration dependency. 5458 // 2. If the loop is really small, then we interleave to reduce the loop 5459 // overhead. 5460 // 3. We don't interleave if we think that we will spill registers to memory 5461 // due to the increased register pressure. 5462 5463 if (!isScalarEpilogueAllowed()) 5464 return 1; 5465 5466 // We used the distance for the interleave count. 5467 if (Legal->getMaxSafeDepDistBytes() != -1U) 5468 return 1; 5469 5470 // Do not interleave loops with a relatively small known or estimated trip 5471 // count. 5472 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5473 if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) 5474 return 1; 5475 5476 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5477 // We divide by these constants so assume that we have at least one 5478 // instruction that uses at least one register. 5479 for (auto& pair : R.MaxLocalUsers) { 5480 pair.second = std::max(pair.second, 1U); 5481 } 5482 5483 // We calculate the interleave count using the following formula. 5484 // Subtract the number of loop invariants from the number of available 5485 // registers. These registers are used by all of the interleaved instances. 5486 // Next, divide the remaining registers by the number of registers that is 5487 // required by the loop, in order to estimate how many parallel instances 5488 // fit without causing spills. All of this is rounded down if necessary to be 5489 // a power of two. We want power of two interleave count to simplify any 5490 // addressing operations or alignment considerations. 5491 // We also want power of two interleave counts to ensure that the induction 5492 // variable of the vector loop wraps to zero, when tail is folded by masking; 5493 // this currently happens when OptForSize, in which case IC is set to 1 above. 5494 unsigned IC = UINT_MAX; 5495 5496 for (auto& pair : R.MaxLocalUsers) { 5497 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5498 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5499 << " registers of " 5500 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5501 if (VF == 1) { 5502 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5503 TargetNumRegisters = ForceTargetNumScalarRegs; 5504 } else { 5505 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5506 TargetNumRegisters = ForceTargetNumVectorRegs; 5507 } 5508 unsigned MaxLocalUsers = pair.second; 5509 unsigned LoopInvariantRegs = 0; 5510 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5511 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5512 5513 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5514 // Don't count the induction variable as interleaved. 5515 if (EnableIndVarRegisterHeur) { 5516 TmpIC = 5517 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5518 std::max(1U, (MaxLocalUsers - 1))); 5519 } 5520 5521 IC = std::min(IC, TmpIC); 5522 } 5523 5524 // Clamp the interleave ranges to reasonable counts. 5525 assert(!VF.Scalable && "scalable vectors not yet supported."); 5526 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF.Min); 5527 5528 // Check if the user has overridden the max. 
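// (ForceTargetMaxScalarInterleaveFactor and ForceTargetMaxVectorInterleaveFactor
// below, when set on the command line, take precedence over the target's
// reported maximum; the scalar and vector cases are controlled independently.)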
5529 if (VF == 1) {
5530 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5531 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5532 } else {
5533 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5534 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5535 }
5536
5537 // If the trip count is a known or estimated compile-time constant, limit the
5538 // interleave count to be less than the trip count divided by VF.
5539 if (BestKnownTC) {
5540 MaxInterleaveCount = std::min(*BestKnownTC / VF.Min, MaxInterleaveCount);
5541 }
5542
5543 // If we did not calculate the cost for VF (because the user selected the VF)
5544 // then we calculate the cost of VF here.
5545 if (LoopCost == 0)
5546 LoopCost = expectedCost(VF).first;
5547
5548 assert(LoopCost && "Non-zero loop cost expected");
5549
5550 // Clamp the calculated IC to be between 1 and the max interleave count
5551 // that the target and trip count allow.
5552 if (IC > MaxInterleaveCount)
5553 IC = MaxInterleaveCount;
5554 else if (IC < 1)
5555 IC = 1;
5556
5557 // Interleave if we vectorized this loop and there is a reduction that could
5558 // benefit from interleaving.
5559 if (VF.isVector() && !Legal->getReductionVars().empty()) {
5560 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5561 return IC;
5562 }
5563
5564 // Note that if we've already vectorized the loop we will have done the
5565 // runtime check and so interleaving won't require further checks.
5566 bool InterleavingRequiresRuntimePointerCheck =
5567 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5568
5569 // We want to interleave small loops in order to reduce the loop overhead and
5570 // potentially expose ILP opportunities.
5571 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5572 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5573 // We assume that the cost overhead is 1 and we use the cost model
5574 // to estimate the cost of the loop and interleave until the cost of the
5575 // loop overhead is about 5% of the cost of the loop.
5576 unsigned SmallIC =
5577 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5578
5579 // Interleave until store/load ports (estimated by max interleave count) are
5580 // saturated.
5581 unsigned NumStores = Legal->getNumStores();
5582 unsigned NumLoads = Legal->getNumLoads();
5583 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5584 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5585
5586 // If we have a scalar reduction (vector reductions are already dealt with
5587 // by this point), we can increase the critical path length if the loop
5588 // we're interleaving is inside another loop. Limit this, by default, to 2,
5589 // so the critical path only gets increased by one reduction operation.
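// (With the default MaxNestedScalarReductionIC of 2, SmallIC, StoresIC and
// LoadsIC are each capped at 2 below.)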
5590 if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
5591 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5592 SmallIC = std::min(SmallIC, F);
5593 StoresIC = std::min(StoresIC, F);
5594 LoadsIC = std::min(LoadsIC, F);
5595 }
5596
5597 if (EnableLoadStoreRuntimeInterleave &&
5598 std::max(StoresIC, LoadsIC) > SmallIC) {
5599 LLVM_DEBUG(
5600 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5601 return std::max(StoresIC, LoadsIC);
5602 }
5603
5604 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5605 return SmallIC;
5606 }
5607
5608 // Interleave if this is a large loop (small loops are already dealt with by
5609 // this point) that could benefit from interleaving.
5610 bool HasReductions = !Legal->getReductionVars().empty();
5611 if (TTI.enableAggressiveInterleaving(HasReductions)) {
5612 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5613 return IC;
5614 }
5615
5616 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5617 return 1;
5618 }
5619
5620 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5621 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5622 // This function calculates the register usage by measuring the highest number
5623 // of values that are alive at a single location. Obviously, this is a very
5624 // rough estimation. We scan the loop in topological order and
5625 // assign a number to each instruction. We use RPO to ensure that defs are
5626 // met before their users. We assume that each instruction that has in-loop
5627 // users starts an interval. We record every time that an in-loop value is
5628 // used, so we have a list of the first and last occurrences of each
5629 // instruction. Next, we transpose this data structure into a multi-map that
5630 // holds the list of intervals that *end* at a specific location. This multi-map
5631 // allows us to perform a linear search. We scan the instructions linearly
5632 // and record each time that a new interval starts, by placing it in a set.
5633 // If we find this value in the multi-map then we remove it from the set.
5634 // The max register usage is the maximum size of the set.
5635 // We also search for instructions that are defined outside the loop, but are
5636 // used inside the loop. We need this number separately from the max-interval
5637 // usage number because when we unroll, loop-invariant values do not take
5638 // more registers.
5639 LoopBlocksDFS DFS(TheLoop);
5640 DFS.perform(LI);
5641
5642 RegisterUsage RU;
5643
5644 // Each 'key' in the map opens a new interval. The value of each
5645 // map entry is the index of the 'last seen' usage of the
5646 // instruction that is the key.
5647 using IntervalMap = DenseMap<Instruction *, unsigned>;
5648
5649 // Maps instruction to its index.
5650 SmallVector<Instruction *, 64> IdxToInstr;
5651 // Marks the end of each interval.
5652 IntervalMap EndPoint;
5653 // Saves the instructions that are used in the loop.
5654 SmallPtrSet<Instruction *, 8> Ends;
5655 // Saves the list of values that are used in the loop but are
5656 // defined outside the loop, such as arguments and constants.
5657 SmallPtrSet<Value *, 8> LoopInvariants;
5658
5659 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5660 for (Instruction &I : BB->instructionsWithoutDebug()) {
5661 IdxToInstr.push_back(&I);
5662
5663 // Save the end location of each USE.
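// Later uses overwrite earlier ones, so after this walk EndPoint holds the
// index of the last in-loop use of each value; together with the RPO
// definition order this yields one live interval per instruction.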
5664 for (Value *U : I.operands()) { 5665 auto *Instr = dyn_cast<Instruction>(U); 5666 5667 // Ignore non-instruction values such as arguments, constants, etc. 5668 if (!Instr) 5669 continue; 5670 5671 // If this instruction is outside the loop then record it and continue. 5672 if (!TheLoop->contains(Instr)) { 5673 LoopInvariants.insert(Instr); 5674 continue; 5675 } 5676 5677 // Overwrite previous end points. 5678 EndPoint[Instr] = IdxToInstr.size(); 5679 Ends.insert(Instr); 5680 } 5681 } 5682 } 5683 5684 // Saves the list of intervals that end with the index in 'key'. 5685 using InstrList = SmallVector<Instruction *, 2>; 5686 DenseMap<unsigned, InstrList> TransposeEnds; 5687 5688 // Transpose the EndPoints to a list of values that end at each index. 5689 for (auto &Interval : EndPoint) 5690 TransposeEnds[Interval.second].push_back(Interval.first); 5691 5692 SmallPtrSet<Instruction *, 8> OpenIntervals; 5693 5694 // Get the size of the widest register. 5695 unsigned MaxSafeDepDist = -1U; 5696 if (Legal->getMaxSafeDepDistBytes() != -1U) 5697 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5698 unsigned WidestRegister = 5699 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5700 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5701 5702 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5703 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5704 5705 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5706 5707 // A lambda that gets the register usage for the given type and VF. 5708 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, ElementCount VF) { 5709 if (Ty->isTokenTy()) 5710 return 0U; 5711 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5712 assert(!VF.Scalable && "scalable vectors not yet supported."); 5713 return std::max<unsigned>(1, VF.Min * TypeSize / WidestRegister); 5714 }; 5715 5716 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5717 Instruction *I = IdxToInstr[i]; 5718 5719 // Remove all of the instructions that end at this location. 5720 InstrList &List = TransposeEnds[i]; 5721 for (Instruction *ToRemove : List) 5722 OpenIntervals.erase(ToRemove); 5723 5724 // Ignore instructions that are never used within the loop. 5725 if (!Ends.count(I)) 5726 continue; 5727 5728 // Skip ignored values. 5729 if (ValuesToIgnore.count(I)) 5730 continue; 5731 5732 // For each VF find the maximum usage of registers. 5733 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5734 // Count the number of live intervals. 5735 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5736 5737 if (VFs[j].isScalar()) { 5738 for (auto Inst : OpenIntervals) { 5739 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5740 if (RegUsage.find(ClassID) == RegUsage.end()) 5741 RegUsage[ClassID] = 1; 5742 else 5743 RegUsage[ClassID] += 1; 5744 } 5745 } else { 5746 collectUniformsAndScalars(VFs[j]); 5747 for (auto Inst : OpenIntervals) { 5748 // Skip ignored values for VF > 1. 
5749 if (VecValuesToIgnore.count(Inst)) 5750 continue; 5751 if (isScalarAfterVectorization(Inst, VFs[j])) { 5752 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5753 if (RegUsage.find(ClassID) == RegUsage.end()) 5754 RegUsage[ClassID] = 1; 5755 else 5756 RegUsage[ClassID] += 1; 5757 } else { 5758 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5759 if (RegUsage.find(ClassID) == RegUsage.end()) 5760 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5761 else 5762 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5763 } 5764 } 5765 } 5766 5767 for (auto& pair : RegUsage) { 5768 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5769 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5770 else 5771 MaxUsages[j][pair.first] = pair.second; 5772 } 5773 } 5774 5775 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5776 << OpenIntervals.size() << '\n'); 5777 5778 // Add the current instruction to the list of open intervals. 5779 OpenIntervals.insert(I); 5780 } 5781 5782 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5783 SmallMapVector<unsigned, unsigned, 4> Invariant; 5784 5785 for (auto Inst : LoopInvariants) { 5786 unsigned Usage = 5787 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5788 unsigned ClassID = 5789 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 5790 if (Invariant.find(ClassID) == Invariant.end()) 5791 Invariant[ClassID] = Usage; 5792 else 5793 Invariant[ClassID] += Usage; 5794 } 5795 5796 LLVM_DEBUG({ 5797 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5798 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5799 << " item\n"; 5800 for (const auto &pair : MaxUsages[i]) { 5801 dbgs() << "LV(REG): RegisterClass: " 5802 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5803 << " registers\n"; 5804 } 5805 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5806 << " item\n"; 5807 for (const auto &pair : Invariant) { 5808 dbgs() << "LV(REG): RegisterClass: " 5809 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5810 << " registers\n"; 5811 } 5812 }); 5813 5814 RU.LoopInvariantRegs = Invariant; 5815 RU.MaxLocalUsers = MaxUsages[i]; 5816 RUs[i] = RU; 5817 } 5818 5819 return RUs; 5820 } 5821 5822 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5823 // TODO: Cost model for emulated masked load/store is completely 5824 // broken. This hack guides the cost model to use an artificially 5825 // high enough value to practically disable vectorization with such 5826 // operations, except where previously deployed legality hack allowed 5827 // using very low cost values. This is to avoid regressions coming simply 5828 // from moving "masked load/store" check from legality to cost model. 5829 // Masked Load/Gather emulation was previously never allowed. 5830 // Limited number of Masked Store/Scatter emulation was allowed. 5831 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 5832 return isa<LoadInst>(I) || 5833 (isa<StoreInst>(I) && 5834 NumPredStores > NumberOfStoresToPredicate); 5835 } 5836 5837 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 5838 // If we aren't vectorizing the loop, or if we've already collected the 5839 // instructions to scalarize, there's nothing to do. Collection may already 5840 // have occurred if we have a user-selected VF and are now computing the 5841 // expected cost for interleaving. 
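// For each scalar-with-predication instruction, computePredInstDiscount
// (below) estimates whether keeping its single-use chain scalar and
// predicated is cheaper than if-converting and vectorizing it; chains with a
// non-negative discount have their per-instruction scalar costs recorded in
// InstsToScalarize[VF].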
5842 if (VF.isScalar() || VF.isZero() || 5843 InstsToScalarize.find(VF) != InstsToScalarize.end()) 5844 return; 5845 5846 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 5847 // not profitable to scalarize any instructions, the presence of VF in the 5848 // map will indicate that we've analyzed it already. 5849 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 5850 5851 // Find all the instructions that are scalar with predication in the loop and 5852 // determine if it would be better to not if-convert the blocks they are in. 5853 // If so, we also record the instructions to scalarize. 5854 for (BasicBlock *BB : TheLoop->blocks()) { 5855 if (!blockNeedsPredication(BB)) 5856 continue; 5857 for (Instruction &I : *BB) 5858 if (isScalarWithPredication(&I)) { 5859 ScalarCostsTy ScalarCosts; 5860 // Do not apply discount logic if hacked cost is needed 5861 // for emulated masked memrefs. 5862 if (!useEmulatedMaskMemRefHack(&I) && 5863 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 5864 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5865 // Remember that BB will remain after vectorization. 5866 PredicatedBBsAfterVectorization.insert(BB); 5867 } 5868 } 5869 } 5870 5871 int LoopVectorizationCostModel::computePredInstDiscount( 5872 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, 5873 ElementCount VF) { 5874 assert(!isUniformAfterVectorization(PredInst, VF) && 5875 "Instruction marked uniform-after-vectorization will be predicated"); 5876 5877 // Initialize the discount to zero, meaning that the scalar version and the 5878 // vector version cost the same. 5879 int Discount = 0; 5880 5881 // Holds instructions to analyze. The instructions we visit are mapped in 5882 // ScalarCosts. Those instructions are the ones that would be scalarized if 5883 // we find that the scalar version costs less. 5884 SmallVector<Instruction *, 8> Worklist; 5885 5886 // Returns true if the given instruction can be scalarized. 5887 auto canBeScalarized = [&](Instruction *I) -> bool { 5888 // We only attempt to scalarize instructions forming a single-use chain 5889 // from the original predicated block that would otherwise be vectorized. 5890 // Although not strictly necessary, we give up on instructions we know will 5891 // already be scalar to avoid traversing chains that are unlikely to be 5892 // beneficial. 5893 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5894 isScalarAfterVectorization(I, VF)) 5895 return false; 5896 5897 // If the instruction is scalar with predication, it will be analyzed 5898 // separately. We ignore it within the context of PredInst. 5899 if (isScalarWithPredication(I)) 5900 return false; 5901 5902 // If any of the instruction's operands are uniform after vectorization, 5903 // the instruction cannot be scalarized. This prevents, for example, a 5904 // masked load from being scalarized. 5905 // 5906 // We assume we will only emit a value for lane zero of an instruction 5907 // marked uniform after vectorization, rather than VF identical values. 5908 // Thus, if we scalarize an instruction that uses a uniform, we would 5909 // create uses of values corresponding to the lanes we aren't emitting code 5910 // for. This behavior can be changed by allowing getScalarValue to clone 5911 // the lane zero values for uniforms rather than asserting. 
5912 for (Use &U : I->operands()) 5913 if (auto *J = dyn_cast<Instruction>(U.get())) 5914 if (isUniformAfterVectorization(J, VF)) 5915 return false; 5916 5917 // Otherwise, we can scalarize the instruction. 5918 return true; 5919 }; 5920 5921 // Compute the expected cost discount from scalarizing the entire expression 5922 // feeding the predicated instruction. We currently only consider expressions 5923 // that are single-use instruction chains. 5924 Worklist.push_back(PredInst); 5925 while (!Worklist.empty()) { 5926 Instruction *I = Worklist.pop_back_val(); 5927 5928 // If we've already analyzed the instruction, there's nothing to do. 5929 if (ScalarCosts.find(I) != ScalarCosts.end()) 5930 continue; 5931 5932 // Compute the cost of the vector instruction. Note that this cost already 5933 // includes the scalarization overhead of the predicated instruction. 5934 unsigned VectorCost = getInstructionCost(I, VF).first; 5935 5936 // Compute the cost of the scalarized instruction. This cost is the cost of 5937 // the instruction as if it wasn't if-converted and instead remained in the 5938 // predicated block. We will scale this cost by block probability after 5939 // computing the scalarization overhead. 5940 assert(!VF.Scalable && "scalable vectors not yet supported."); 5941 unsigned ScalarCost = 5942 VF.Min * getInstructionCost(I, ElementCount::getFixed(1)).first; 5943 5944 // Compute the scalarization overhead of needed insertelement instructions 5945 // and phi nodes. 5946 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5947 ScalarCost += TTI.getScalarizationOverhead( 5948 cast<VectorType>(ToVectorTy(I->getType(), VF)), 5949 APInt::getAllOnesValue(VF.Min), true, false); 5950 assert(!VF.Scalable && "scalable vectors not yet supported."); 5951 ScalarCost += 5952 VF.Min * 5953 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 5954 } 5955 5956 // Compute the scalarization overhead of needed extractelement 5957 // instructions. For each of the instruction's operands, if the operand can 5958 // be scalarized, add it to the worklist; otherwise, account for the 5959 // overhead. 5960 for (Use &U : I->operands()) 5961 if (auto *J = dyn_cast<Instruction>(U.get())) { 5962 assert(VectorType::isValidElementType(J->getType()) && 5963 "Instruction has non-scalar type"); 5964 if (canBeScalarized(J)) 5965 Worklist.push_back(J); 5966 else if (needsExtract(J, VF)) { 5967 assert(!VF.Scalable && "scalable vectors not yet supported."); 5968 ScalarCost += TTI.getScalarizationOverhead( 5969 cast<VectorType>(ToVectorTy(J->getType(), VF)), 5970 APInt::getAllOnesValue(VF.Min), false, true); 5971 } 5972 } 5973 5974 // Scale the total scalar cost by block probability. 5975 ScalarCost /= getReciprocalPredBlockProb(); 5976 5977 // Compute the discount. A non-negative discount means the vector version 5978 // of the instruction costs more, and scalarizing would be beneficial. 5979 Discount += VectorCost - ScalarCost; 5980 ScalarCosts[I] = ScalarCost; 5981 } 5982 5983 return Discount; 5984 } 5985 5986 LoopVectorizationCostModel::VectorizationCostTy 5987 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 5988 assert(!VF.Scalable && "scalable vectors not yet supported."); 5989 VectorizationCostTy Cost; 5990 5991 // For each block. 5992 for (BasicBlock *BB : TheLoop->blocks()) { 5993 VectorizationCostTy BlockCost; 5994 5995 // For each instruction in the old loop. 5996 for (Instruction &I : BB->instructionsWithoutDebug()) { 5997 // Skip ignored values. 
5998 if (ValuesToIgnore.count(&I) || 5999 (VF.isVector() && VecValuesToIgnore.count(&I))) 6000 continue; 6001 6002 VectorizationCostTy C = getInstructionCost(&I, VF); 6003 6004 // Check if we should override the cost. 6005 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6006 C.first = ForceTargetInstructionCost; 6007 6008 BlockCost.first += C.first; 6009 BlockCost.second |= C.second; 6010 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6011 << " for VF " << VF << " For instruction: " << I 6012 << '\n'); 6013 } 6014 6015 // If we are vectorizing a predicated block, it will have been 6016 // if-converted. This means that the block's instructions (aside from 6017 // stores and instructions that may divide by zero) will now be 6018 // unconditionally executed. For the scalar case, we may not always execute 6019 // the predicated block. Thus, scale the block's cost by the probability of 6020 // executing it. 6021 if (VF.isScalar() && blockNeedsPredication(BB)) 6022 BlockCost.first /= getReciprocalPredBlockProb(); 6023 6024 Cost.first += BlockCost.first; 6025 Cost.second |= BlockCost.second; 6026 } 6027 6028 return Cost; 6029 } 6030 6031 /// Gets Address Access SCEV after verifying that the access pattern 6032 /// is loop invariant except the induction variable dependence. 6033 /// 6034 /// This SCEV can be sent to the Target in order to estimate the address 6035 /// calculation cost. 6036 static const SCEV *getAddressAccessSCEV( 6037 Value *Ptr, 6038 LoopVectorizationLegality *Legal, 6039 PredicatedScalarEvolution &PSE, 6040 const Loop *TheLoop) { 6041 6042 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6043 if (!Gep) 6044 return nullptr; 6045 6046 // We are looking for a gep with all loop invariant indices except for one 6047 // which should be an induction variable. 6048 auto SE = PSE.getSE(); 6049 unsigned NumOperands = Gep->getNumOperands(); 6050 for (unsigned i = 1; i < NumOperands; ++i) { 6051 Value *Opd = Gep->getOperand(i); 6052 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6053 !Legal->isInductionVariable(Opd)) 6054 return nullptr; 6055 } 6056 6057 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6058 return PSE.getSCEV(Ptr); 6059 } 6060 6061 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6062 return Legal->hasStride(I->getOperand(0)) || 6063 Legal->hasStride(I->getOperand(1)); 6064 } 6065 6066 unsigned 6067 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6068 ElementCount VF) { 6069 assert(VF.isVector() && 6070 "Scalarization cost of instruction implies vectorization."); 6071 assert(!VF.Scalable && "scalable vectors not yet supported."); 6072 Type *ValTy = getMemInstValueType(I); 6073 auto SE = PSE.getSE(); 6074 6075 unsigned AS = getLoadStoreAddressSpace(I); 6076 Value *Ptr = getLoadStorePointerOperand(I); 6077 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6078 6079 // Figure out whether the access is strided and get the stride value 6080 // if it's known in compile time 6081 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6082 6083 // Get the cost of the scalar memory instruction and address computation. 6084 unsigned Cost = VF.Min * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6085 6086 // Don't pass *I here, since it is scalar but will actually be part of a 6087 // vectorized loop where the user of it is a vectorized instruction. 
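// The scalar memory-op cost below is therefore queried without the
// instruction as context and is multiplied by VF.Min, i.e. one scalar access
// per vector lane.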
6088 const Align Alignment = getLoadStoreAlignment(I); 6089 Cost += VF.Min * 6090 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6091 AS, TTI::TCK_RecipThroughput); 6092 6093 // Get the overhead of the extractelement and insertelement instructions 6094 // we might create due to scalarization. 6095 Cost += getScalarizationOverhead(I, VF); 6096 6097 // If we have a predicated store, it may not be executed for each vector 6098 // lane. Scale the cost by the probability of executing the predicated 6099 // block. 6100 if (isPredicatedInst(I)) { 6101 Cost /= getReciprocalPredBlockProb(); 6102 6103 if (useEmulatedMaskMemRefHack(I)) 6104 // Artificially setting to a high enough value to practically disable 6105 // vectorization with such operations. 6106 Cost = 3000000; 6107 } 6108 6109 return Cost; 6110 } 6111 6112 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6113 ElementCount VF) { 6114 Type *ValTy = getMemInstValueType(I); 6115 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6116 Value *Ptr = getLoadStorePointerOperand(I); 6117 unsigned AS = getLoadStoreAddressSpace(I); 6118 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6119 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6120 6121 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6122 "Stride should be 1 or -1 for consecutive memory access"); 6123 const Align Alignment = getLoadStoreAlignment(I); 6124 unsigned Cost = 0; 6125 if (Legal->isMaskRequired(I)) 6126 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6127 CostKind); 6128 else 6129 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6130 CostKind, I); 6131 6132 bool Reverse = ConsecutiveStride < 0; 6133 if (Reverse) 6134 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6135 return Cost; 6136 } 6137 6138 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6139 ElementCount VF) { 6140 Type *ValTy = getMemInstValueType(I); 6141 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6142 const Align Alignment = getLoadStoreAlignment(I); 6143 unsigned AS = getLoadStoreAddressSpace(I); 6144 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6145 if (isa<LoadInst>(I)) { 6146 return TTI.getAddressComputationCost(ValTy) + 6147 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6148 CostKind) + 6149 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6150 } 6151 StoreInst *SI = cast<StoreInst>(I); 6152 6153 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6154 return TTI.getAddressComputationCost(ValTy) + 6155 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6156 CostKind) + 6157 (isLoopInvariantStoreValue ? 
0 : TTI.getVectorInstrCost( 6158 Instruction::ExtractElement, 6159 VectorTy, VF.Min - 1)); 6160 } 6161 6162 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6163 ElementCount VF) { 6164 Type *ValTy = getMemInstValueType(I); 6165 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6166 const Align Alignment = getLoadStoreAlignment(I); 6167 const Value *Ptr = getLoadStorePointerOperand(I); 6168 6169 return TTI.getAddressComputationCost(VectorTy) + 6170 TTI.getGatherScatterOpCost( 6171 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6172 TargetTransformInfo::TCK_RecipThroughput, I); 6173 } 6174 6175 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6176 ElementCount VF) { 6177 Type *ValTy = getMemInstValueType(I); 6178 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6179 unsigned AS = getLoadStoreAddressSpace(I); 6180 6181 auto Group = getInterleavedAccessGroup(I); 6182 assert(Group && "Fail to get an interleaved access group."); 6183 6184 unsigned InterleaveFactor = Group->getFactor(); 6185 assert(!VF.Scalable && "scalable vectors not yet supported."); 6186 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6187 6188 // Holds the indices of existing members in an interleaved load group. 6189 // An interleaved store group doesn't need this as it doesn't allow gaps. 6190 SmallVector<unsigned, 4> Indices; 6191 if (isa<LoadInst>(I)) { 6192 for (unsigned i = 0; i < InterleaveFactor; i++) 6193 if (Group->getMember(i)) 6194 Indices.push_back(i); 6195 } 6196 6197 // Calculate the cost of the whole interleaved group. 6198 bool UseMaskForGaps = 6199 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6200 unsigned Cost = TTI.getInterleavedMemoryOpCost( 6201 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6202 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6203 6204 if (Group->isReverse()) { 6205 // TODO: Add support for reversed masked interleaved access. 6206 assert(!Legal->isMaskRequired(I) && 6207 "Reverse masked interleaved access not supported."); 6208 Cost += Group->getNumMembers() * 6209 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6210 } 6211 return Cost; 6212 } 6213 6214 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6215 ElementCount VF) { 6216 // Calculate scalar cost only. Vectorization cost should be ready at this 6217 // moment. 6218 if (VF.isScalar()) { 6219 Type *ValTy = getMemInstValueType(I); 6220 const Align Alignment = getLoadStoreAlignment(I); 6221 unsigned AS = getLoadStoreAddressSpace(I); 6222 6223 return TTI.getAddressComputationCost(ValTy) + 6224 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6225 TTI::TCK_RecipThroughput, I); 6226 } 6227 return getWideningCost(I, VF); 6228 } 6229 6230 LoopVectorizationCostModel::VectorizationCostTy 6231 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6232 ElementCount VF) { 6233 assert(!VF.Scalable && 6234 "the cost model is not yet implemented for scalable vectorization"); 6235 // If we know that this instruction will remain uniform, check the cost of 6236 // the scalar version. 6237 if (isUniformAfterVectorization(I, VF)) 6238 VF = ElementCount::getFixed(1); 6239 6240 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6241 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6242 6243 // Forced scalars do not have any scalarization overhead. 
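// Their cost is simply VF.Min copies of the scalar instruction cost, as
// computed below.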
6244 auto ForcedScalar = ForcedScalars.find(VF); 6245 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6246 auto InstSet = ForcedScalar->second; 6247 if (InstSet.count(I)) 6248 return VectorizationCostTy( 6249 (getInstructionCost(I, ElementCount::getFixed(1)).first * VF.Min), 6250 false); 6251 } 6252 6253 Type *VectorTy; 6254 unsigned C = getInstructionCost(I, VF, VectorTy); 6255 6256 bool TypeNotScalarized = VF.isVector() && VectorTy->isVectorTy() && 6257 TTI.getNumberOfParts(VectorTy) < VF.Min; 6258 return VectorizationCostTy(C, TypeNotScalarized); 6259 } 6260 6261 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6262 ElementCount VF) { 6263 6264 assert(!VF.Scalable && 6265 "cannot compute scalarization overhead for scalable vectorization"); 6266 if (VF.isScalar()) 6267 return 0; 6268 6269 unsigned Cost = 0; 6270 Type *RetTy = ToVectorTy(I->getType(), VF); 6271 if (!RetTy->isVoidTy() && 6272 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6273 Cost += TTI.getScalarizationOverhead( 6274 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.Min), true, false); 6275 6276 // Some targets keep addresses scalar. 6277 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6278 return Cost; 6279 6280 // Some targets support efficient element stores. 6281 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6282 return Cost; 6283 6284 // Collect operands to consider. 6285 CallInst *CI = dyn_cast<CallInst>(I); 6286 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6287 6288 // Skip operands that do not require extraction/scalarization and do not incur 6289 // any overhead. 6290 return Cost + 6291 TTI.getOperandsScalarizationOverhead(filterExtractingOperands(Ops, VF), 6292 VF.Min); 6293 } 6294 6295 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6296 assert(!VF.Scalable && "scalable vectors not yet supported."); 6297 if (VF.isScalar()) 6298 return; 6299 NumPredStores = 0; 6300 for (BasicBlock *BB : TheLoop->blocks()) { 6301 // For each instruction in the old loop. 6302 for (Instruction &I : *BB) { 6303 Value *Ptr = getLoadStorePointerOperand(&I); 6304 if (!Ptr) 6305 continue; 6306 6307 // TODO: We should generate better code and update the cost model for 6308 // predicated uniform stores. Today they are treated as any other 6309 // predicated store (see added test cases in 6310 // invariant-store-vectorization.ll). 6311 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6312 NumPredStores++; 6313 6314 if (Legal->isUniform(Ptr) && 6315 // Conditional loads and stores should be scalarized and predicated. 6316 // isScalarWithPredication cannot be used here since masked 6317 // gather/scatters are not considered scalar with predication. 6318 !Legal->blockNeedsPredication(I.getParent())) { 6319 // TODO: Avoid replicating loads and stores instead of 6320 // relying on instcombine to remove them. 6321 // Load: Scalar load + broadcast 6322 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6323 unsigned Cost = getUniformMemOpCost(&I, VF); 6324 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6325 continue; 6326 } 6327 6328 // We assume that widening is the best solution when possible. 
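// The decision cascade below is: widen consecutive accesses (reversed when
// the stride is -1); otherwise pick the cheapest of interleaving,
// gather/scatter and scalarization for this VF.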
6329 if (memoryInstructionCanBeWidened(&I, VF)) { 6330 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6331 int ConsecutiveStride = 6332 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6333 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6334 "Expected consecutive stride."); 6335 InstWidening Decision = 6336 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6337 setWideningDecision(&I, VF, Decision, Cost); 6338 continue; 6339 } 6340 6341 // Choose between Interleaving, Gather/Scatter or Scalarization. 6342 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6343 unsigned NumAccesses = 1; 6344 if (isAccessInterleaved(&I)) { 6345 auto Group = getInterleavedAccessGroup(&I); 6346 assert(Group && "Fail to get an interleaved access group."); 6347 6348 // Make one decision for the whole group. 6349 if (getWideningDecision(&I, VF) != CM_Unknown) 6350 continue; 6351 6352 NumAccesses = Group->getNumMembers(); 6353 if (interleavedAccessCanBeWidened(&I, VF)) 6354 InterleaveCost = getInterleaveGroupCost(&I, VF); 6355 } 6356 6357 unsigned GatherScatterCost = 6358 isLegalGatherOrScatter(&I) 6359 ? getGatherScatterCost(&I, VF) * NumAccesses 6360 : std::numeric_limits<unsigned>::max(); 6361 6362 unsigned ScalarizationCost = 6363 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6364 6365 // Choose better solution for the current VF, 6366 // write down this decision and use it during vectorization. 6367 unsigned Cost; 6368 InstWidening Decision; 6369 if (InterleaveCost <= GatherScatterCost && 6370 InterleaveCost < ScalarizationCost) { 6371 Decision = CM_Interleave; 6372 Cost = InterleaveCost; 6373 } else if (GatherScatterCost < ScalarizationCost) { 6374 Decision = CM_GatherScatter; 6375 Cost = GatherScatterCost; 6376 } else { 6377 Decision = CM_Scalarize; 6378 Cost = ScalarizationCost; 6379 } 6380 // If the instructions belongs to an interleave group, the whole group 6381 // receives the same decision. The whole group receives the cost, but 6382 // the cost will actually be assigned to one instruction. 6383 if (auto Group = getInterleavedAccessGroup(&I)) 6384 setWideningDecision(Group, VF, Decision, Cost); 6385 else 6386 setWideningDecision(&I, VF, Decision, Cost); 6387 } 6388 } 6389 6390 // Make sure that any load of address and any other address computation 6391 // remains scalar unless there is gather/scatter support. This avoids 6392 // inevitable extracts into address registers, and also has the benefit of 6393 // activating LSR more, since that pass can't optimize vectorized 6394 // addresses. 6395 if (TTI.prefersVectorizedAddressing()) 6396 return; 6397 6398 // Start with all scalar pointer uses. 6399 SmallPtrSet<Instruction *, 8> AddrDefs; 6400 for (BasicBlock *BB : TheLoop->blocks()) 6401 for (Instruction &I : *BB) { 6402 Instruction *PtrDef = 6403 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6404 if (PtrDef && TheLoop->contains(PtrDef) && 6405 getWideningDecision(&I, VF) != CM_GatherScatter) 6406 AddrDefs.insert(PtrDef); 6407 } 6408 6409 // Add all instructions used to generate the addresses. 
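// This is a transitive walk over same-block, non-PHI operands, so the
// computations feeding an address (typically GEPs and casts) are kept scalar
// together with the address itself.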
6410 SmallVector<Instruction *, 4> Worklist; 6411 for (auto *I : AddrDefs) 6412 Worklist.push_back(I); 6413 while (!Worklist.empty()) { 6414 Instruction *I = Worklist.pop_back_val(); 6415 for (auto &Op : I->operands()) 6416 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6417 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6418 AddrDefs.insert(InstOp).second) 6419 Worklist.push_back(InstOp); 6420 } 6421 6422 for (auto *I : AddrDefs) { 6423 if (isa<LoadInst>(I)) { 6424 // Setting the desired widening decision should ideally be handled in 6425 // by cost functions, but since this involves the task of finding out 6426 // if the loaded register is involved in an address computation, it is 6427 // instead changed here when we know this is the case. 6428 InstWidening Decision = getWideningDecision(I, VF); 6429 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6430 // Scalarize a widened load of address. 6431 setWideningDecision( 6432 I, VF, CM_Scalarize, 6433 (VF.Min * getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6434 else if (auto Group = getInterleavedAccessGroup(I)) { 6435 // Scalarize an interleave group of address loads. 6436 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6437 if (Instruction *Member = Group->getMember(I)) 6438 setWideningDecision( 6439 Member, VF, CM_Scalarize, 6440 (VF.Min * 6441 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6442 } 6443 } 6444 } else 6445 // Make sure I gets scalarized and a cost estimate without 6446 // scalarization overhead. 6447 ForcedScalars[VF].insert(I); 6448 } 6449 } 6450 6451 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6452 ElementCount VF, 6453 Type *&VectorTy) { 6454 Type *RetTy = I->getType(); 6455 if (canTruncateToMinimalBitwidth(I, VF)) 6456 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6457 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6458 auto SE = PSE.getSE(); 6459 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6460 6461 // TODO: We need to estimate the cost of intrinsic calls. 6462 switch (I->getOpcode()) { 6463 case Instruction::GetElementPtr: 6464 // We mark this instruction as zero-cost because the cost of GEPs in 6465 // vectorized code depends on whether the corresponding memory instruction 6466 // is scalarized or not. Therefore, we handle GEPs with the memory 6467 // instruction cost. 6468 return 0; 6469 case Instruction::Br: { 6470 // In cases of scalarized and predicated instructions, there will be VF 6471 // predicated blocks in the vectorized loop. Each branch around these 6472 // blocks requires also an extract of its vector compare i1 element. 6473 bool ScalarPredicatedBB = false; 6474 BranchInst *BI = cast<BranchInst>(I); 6475 if (VF.isVector() && BI->isConditional() && 6476 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 6477 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 6478 ScalarPredicatedBB = true; 6479 6480 if (ScalarPredicatedBB) { 6481 // Return cost for branches around scalarized and predicated blocks. 
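// This is the cost of extracting the i1 mask element for each of the VF.Min
// lanes plus VF.Min scalar branches.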
6482 assert(!VF.Scalable && "scalable vectors not yet supported."); 6483 auto *Vec_i1Ty = 6484 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6485 return (TTI.getScalarizationOverhead( 6486 Vec_i1Ty, APInt::getAllOnesValue(VF.Min), false, true) + 6487 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.Min)); 6488 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 6489 // The back-edge branch will remain, as will all scalar branches. 6490 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6491 else 6492 // This branch will be eliminated by if-conversion. 6493 return 0; 6494 // Note: We currently assume zero cost for an unconditional branch inside 6495 // a predicated block since it will become a fall-through, although we 6496 // may decide in the future to call TTI for all branches. 6497 } 6498 case Instruction::PHI: { 6499 auto *Phi = cast<PHINode>(I); 6500 6501 // First-order recurrences are replaced by vector shuffles inside the loop. 6502 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6503 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 6504 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 6505 cast<VectorType>(VectorTy), VF.Min - 1, 6506 FixedVectorType::get(RetTy, 1)); 6507 6508 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6509 // converted into select instructions. We require N - 1 selects per phi 6510 // node, where N is the number of incoming values. 6511 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 6512 return (Phi->getNumIncomingValues() - 1) * 6513 TTI.getCmpSelInstrCost( 6514 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6515 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6516 CostKind); 6517 6518 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6519 } 6520 case Instruction::UDiv: 6521 case Instruction::SDiv: 6522 case Instruction::URem: 6523 case Instruction::SRem: 6524 // If we have a predicated instruction, it may not be executed for each 6525 // vector lane. Get the scalarization cost and scale this amount by the 6526 // probability of executing the predicated block. If the instruction is not 6527 // predicated, we fall through to the next case. 6528 if (VF.isVector() && isScalarWithPredication(I)) { 6529 unsigned Cost = 0; 6530 6531 // These instructions have a non-void type, so account for the phi nodes 6532 // that we will create. This cost is likely to be zero. The phi node 6533 // cost, if any, should be scaled by the block probability because it 6534 // models a copy at the end of each predicated block. 6535 Cost += VF.Min * TTI.getCFInstrCost(Instruction::PHI, CostKind); 6536 6537 // The cost of the non-predicated instruction. 6538 Cost += 6539 VF.Min * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 6540 6541 // The cost of insertelement and extractelement instructions needed for 6542 // scalarization. 6543 Cost += getScalarizationOverhead(I, VF); 6544 6545 // Scale the cost by the probability of executing the predicated blocks. 6546 // This assumes the predicated block for each vector lane is equally 6547 // likely. 
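// (For instance, under an assumed one-in-two execution probability the
// scalarized cost computed above is halved.)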
6548 return Cost / getReciprocalPredBlockProb(); 6549 } 6550 LLVM_FALLTHROUGH; 6551 case Instruction::Add: 6552 case Instruction::FAdd: 6553 case Instruction::Sub: 6554 case Instruction::FSub: 6555 case Instruction::Mul: 6556 case Instruction::FMul: 6557 case Instruction::FDiv: 6558 case Instruction::FRem: 6559 case Instruction::Shl: 6560 case Instruction::LShr: 6561 case Instruction::AShr: 6562 case Instruction::And: 6563 case Instruction::Or: 6564 case Instruction::Xor: { 6565 // Since we will replace the stride by 1 the multiplication should go away. 6566 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6567 return 0; 6568 // Certain instructions can be cheaper to vectorize if they have a constant 6569 // second vector operand. One example of this are shifts on x86. 6570 Value *Op2 = I->getOperand(1); 6571 TargetTransformInfo::OperandValueProperties Op2VP; 6572 TargetTransformInfo::OperandValueKind Op2VK = 6573 TTI.getOperandInfo(Op2, Op2VP); 6574 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6575 Op2VK = TargetTransformInfo::OK_UniformValue; 6576 6577 SmallVector<const Value *, 4> Operands(I->operand_values()); 6578 unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1; 6579 return N * TTI.getArithmeticInstrCost( 6580 I->getOpcode(), VectorTy, CostKind, 6581 TargetTransformInfo::OK_AnyValue, 6582 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6583 } 6584 case Instruction::FNeg: { 6585 assert(!VF.Scalable && "VF is assumed to be non scalable."); 6586 unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1; 6587 return N * TTI.getArithmeticInstrCost( 6588 I->getOpcode(), VectorTy, CostKind, 6589 TargetTransformInfo::OK_AnyValue, 6590 TargetTransformInfo::OK_AnyValue, 6591 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6592 I->getOperand(0), I); 6593 } 6594 case Instruction::Select: { 6595 SelectInst *SI = cast<SelectInst>(I); 6596 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6597 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6598 Type *CondTy = SI->getCondition()->getType(); 6599 if (!ScalarCond) { 6600 assert(!VF.Scalable && "VF is assumed to be non scalable."); 6601 CondTy = VectorType::get(CondTy, VF); 6602 } 6603 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 6604 CostKind, I); 6605 } 6606 case Instruction::ICmp: 6607 case Instruction::FCmp: { 6608 Type *ValTy = I->getOperand(0)->getType(); 6609 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6610 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6611 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6612 VectorTy = ToVectorTy(ValTy, VF); 6613 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind, 6614 I); 6615 } 6616 case Instruction::Store: 6617 case Instruction::Load: { 6618 ElementCount Width = VF; 6619 if (Width.isVector()) { 6620 InstWidening Decision = getWideningDecision(I, Width); 6621 assert(Decision != CM_Unknown && 6622 "CM decision should be taken at this point"); 6623 if (Decision == CM_Scalarize) 6624 Width = ElementCount::getFixed(1); 6625 } 6626 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6627 return getMemoryInstructionCost(I, VF); 6628 } 6629 case Instruction::ZExt: 6630 case Instruction::SExt: 6631 case Instruction::FPToUI: 6632 case Instruction::FPToSI: 6633 case Instruction::FPExt: 6634 case Instruction::PtrToInt: 6635 case Instruction::IntToPtr: 6636 case Instruction::SIToFP: 6637 case Instruction::UIToFP: 6638 
case Instruction::Trunc: 6639 case Instruction::FPTrunc: 6640 case Instruction::BitCast: { 6641 // Computes the CastContextHint from a Load/Store instruction. 6642 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 6643 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 6644 "Expected a load or a store!"); 6645 6646 if (VF.isScalar() || !TheLoop->contains(I)) 6647 return TTI::CastContextHint::Normal; 6648 6649 switch (getWideningDecision(I, VF)) { 6650 case LoopVectorizationCostModel::CM_GatherScatter: 6651 return TTI::CastContextHint::GatherScatter; 6652 case LoopVectorizationCostModel::CM_Interleave: 6653 return TTI::CastContextHint::Interleave; 6654 case LoopVectorizationCostModel::CM_Scalarize: 6655 case LoopVectorizationCostModel::CM_Widen: 6656 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 6657 : TTI::CastContextHint::Normal; 6658 case LoopVectorizationCostModel::CM_Widen_Reverse: 6659 return TTI::CastContextHint::Reversed; 6660 case LoopVectorizationCostModel::CM_Unknown: 6661 llvm_unreachable("Instr did not go through cost modelling?"); 6662 } 6663 6664 llvm_unreachable("Unhandled case!"); 6665 }; 6666 6667 unsigned Opcode = I->getOpcode(); 6668 TTI::CastContextHint CCH = TTI::CastContextHint::None; 6669 // For Trunc, the context is the only user, which must be a StoreInst. 6670 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 6671 if (I->hasOneUse()) 6672 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 6673 CCH = ComputeCCH(Store); 6674 } 6675 // For Z/Sext, the context is the operand, which must be a LoadInst. 6676 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 6677 Opcode == Instruction::FPExt) { 6678 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 6679 CCH = ComputeCCH(Load); 6680 } 6681 6682 // We optimize the truncation of induction variables having constant 6683 // integer steps. The cost of these truncations is the same as the scalar 6684 // operation. 6685 if (isOptimizableIVTruncate(I, VF)) { 6686 auto *Trunc = cast<TruncInst>(I); 6687 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6688 Trunc->getSrcTy(), CCH, CostKind, Trunc); 6689 } 6690 6691 Type *SrcScalarTy = I->getOperand(0)->getType(); 6692 Type *SrcVecTy = 6693 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6694 if (canTruncateToMinimalBitwidth(I, VF)) { 6695 // This cast is going to be shrunk. This may remove the cast or it might 6696 // turn it into slightly different cast. For example, if MinBW == 16, 6697 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6698 // 6699 // Calculate the modified src and dest types. 6700 Type *MinVecTy = VectorTy; 6701 if (Opcode == Instruction::Trunc) { 6702 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6703 VectorTy = 6704 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6705 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 6706 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6707 VectorTy = 6708 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6709 } 6710 } 6711 6712 assert(!VF.Scalable && "VF is assumed to be non scalable"); 6713 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.Min : 1; 6714 return N * 6715 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 6716 } 6717 case Instruction::Call: { 6718 bool NeedToScalarize; 6719 CallInst *CI = cast<CallInst>(I); 6720 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6721 if (getVectorIntrinsicIDForCall(CI, TLI)) 6722 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6723 return CallCost; 6724 } 6725 default: 6726 // The cost of executing VF copies of the scalar instruction. This opcode 6727 // is unknown. Assume that it is the same as 'mul'. 6728 return VF.Min * 6729 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, 6730 CostKind) + 6731 getScalarizationOverhead(I, VF); 6732 } // end of switch. 6733 } 6734 6735 char LoopVectorize::ID = 0; 6736 6737 static const char lv_name[] = "Loop Vectorization"; 6738 6739 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6740 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6741 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6742 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6743 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6744 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6745 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6746 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6747 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6748 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6749 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6750 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6751 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6752 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6753 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6754 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6755 6756 namespace llvm { 6757 6758 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6759 6760 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6761 bool VectorizeOnlyWhenForced) { 6762 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6763 } 6764 6765 } // end namespace llvm 6766 6767 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6768 // Check if the pointer operand of a load or store instruction is 6769 // consecutive. 6770 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6771 return Legal->isConsecutivePtr(Ptr); 6772 return false; 6773 } 6774 6775 void LoopVectorizationCostModel::collectValuesToIgnore() { 6776 // Ignore ephemeral values. 6777 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6778 6779 // Ignore type-promoting instructions we identified during reduction 6780 // detection. 6781 for (auto &Reduction : Legal->getReductionVars()) { 6782 RecurrenceDescriptor &RedDes = Reduction.second; 6783 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6784 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6785 } 6786 // Ignore type-casting instructions we identified during induction 6787 // detection. 
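// (The same casts are also collected as trivially dead instructions in
// LoopVectorizationPlanner::collectTriviallyDeadInstructions; the vectorized
// loop does not need them.)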
6788 for (auto &Induction : Legal->getInductionVars()) { 6789 InductionDescriptor &IndDes = Induction.second; 6790 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6791 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6792 } 6793 } 6794 6795 void LoopVectorizationCostModel::collectInLoopReductions() { 6796 // For the moment, without predicated reduction instructions, we do not 6797 // support inloop reductions whilst folding the tail, and hence in those cases 6798 // all reductions are currently out of the loop. 6799 if (!PreferInLoopReductions || foldTailByMasking()) 6800 return; 6801 6802 for (auto &Reduction : Legal->getReductionVars()) { 6803 PHINode *Phi = Reduction.first; 6804 RecurrenceDescriptor &RdxDesc = Reduction.second; 6805 6806 // We don't collect reductions that are type promoted (yet). 6807 if (RdxDesc.getRecurrenceType() != Phi->getType()) 6808 continue; 6809 6810 // Check that we can correctly put the reductions into the loop, by 6811 // finding the chain of operations that leads from the phi to the loop 6812 // exit value. 6813 SmallVector<Instruction *, 4> ReductionOperations = 6814 RdxDesc.getReductionOpChain(Phi, TheLoop); 6815 bool InLoop = !ReductionOperations.empty(); 6816 if (InLoop) 6817 InLoopReductionChains[Phi] = ReductionOperations; 6818 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 6819 << " reduction for phi: " << *Phi << "\n"); 6820 } 6821 } 6822 6823 // TODO: we could return a pair of values that specify the max VF and 6824 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6825 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 6826 // doesn't have a cost model that can choose which plan to execute if 6827 // more than one is generated. 6828 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 6829 LoopVectorizationCostModel &CM) { 6830 unsigned WidestType; 6831 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 6832 return WidestVectorRegBits / WidestType; 6833 } 6834 6835 VectorizationFactor 6836 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 6837 assert(!UserVF.Scalable && "scalable vectors not yet supported"); 6838 ElementCount VF = UserVF; 6839 // Outer loop handling: They may require CFG and instruction level 6840 // transformations before even evaluating whether vectorization is profitable. 6841 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 6842 // the vectorization pipeline. 6843 if (!OrigLoop->empty()) { 6844 // If the user doesn't provide a vectorization factor, determine a 6845 // reasonable one. 6846 if (UserVF.isZero()) { 6847 VF = ElementCount::getFixed( 6848 determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); 6849 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 6850 6851 // Make sure we have a VF > 1 for stress testing. 6852 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 6853 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 6854 << "overriding computed VF.\n"); 6855 VF = ElementCount::getFixed(4); 6856 } 6857 } 6858 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 6859 assert(isPowerOf2_32(VF.Min) && "VF needs to be a power of two"); 6860 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 6861 << "VF " << VF << " to build VPlans.\n"); 6862 buildVPlans(VF.Min, VF.Min); 6863 6864 // For VPlan build stress testing, we bail out after VPlan construction. 
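// Returning VectorizationFactor::Disabled() tells the caller not to
// vectorize; stress testing only exercises VPlan construction itself.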
6865 if (VPlanBuildStressTest) 6866 return VectorizationFactor::Disabled(); 6867 6868 return {VF, 0 /*Cost*/}; 6869 } 6870 6871 LLVM_DEBUG( 6872 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 6873 "VPlan-native path.\n"); 6874 return VectorizationFactor::Disabled(); 6875 } 6876 6877 Optional<VectorizationFactor> 6878 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 6879 assert(!UserVF.Scalable && "scalable vectorization not yet handled"); 6880 assert(OrigLoop->empty() && "Inner loop expected."); 6881 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF.Min, UserIC); 6882 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 6883 return None; 6884 6885 // Invalidate interleave groups if all blocks of loop will be predicated. 6886 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 6887 !useMaskedInterleavedAccesses(*TTI)) { 6888 LLVM_DEBUG( 6889 dbgs() 6890 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 6891 "which requires masked-interleaved support.\n"); 6892 if (CM.InterleaveInfo.invalidateGroups()) 6893 // Invalidating interleave groups also requires invalidating all decisions 6894 // based on them, which includes widening decisions and uniform and scalar 6895 // values. 6896 CM.invalidateCostModelingDecisions(); 6897 } 6898 6899 if (!UserVF.isZero()) { 6900 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 6901 assert(isPowerOf2_32(UserVF.Min) && "VF needs to be a power of two"); 6902 // Collect the instructions (and their associated costs) that will be more 6903 // profitable to scalarize. 6904 CM.selectUserVectorizationFactor(UserVF); 6905 CM.collectInLoopReductions(); 6906 buildVPlansWithVPRecipes(UserVF.Min, UserVF.Min); 6907 LLVM_DEBUG(printPlans(dbgs())); 6908 return {{UserVF, 0}}; 6909 } 6910 6911 unsigned MaxVF = MaybeMaxVF.getValue(); 6912 assert(MaxVF != 0 && "MaxVF is zero."); 6913 6914 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 6915 // Collect Uniform and Scalar instructions after vectorization with VF. 6916 CM.collectUniformsAndScalars(ElementCount::getFixed(VF)); 6917 6918 // Collect the instructions (and their associated costs) that will be more 6919 // profitable to scalarize. 6920 if (VF > 1) 6921 CM.collectInstsToScalarize(ElementCount::getFixed(VF)); 6922 } 6923 6924 CM.collectInLoopReductions(); 6925 6926 buildVPlansWithVPRecipes(1, MaxVF); 6927 LLVM_DEBUG(printPlans(dbgs())); 6928 if (MaxVF == 1) 6929 return VectorizationFactor::Disabled(); 6930 6931 // Select the optimal vectorization factor. 6932 return CM.selectVectorizationFactor(MaxVF); 6933 } 6934 6935 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 6936 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 6937 << '\n'); 6938 BestVF = VF; 6939 BestUF = UF; 6940 6941 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 6942 return !Plan->hasVF(VF); 6943 }); 6944 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 6945 } 6946 6947 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 6948 DominatorTree *DT) { 6949 // Perform the actual loop transformation. 6950 6951 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 
6952 VPCallbackILV CallbackILV(ILV); 6953 6954 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 6955 6956 VPTransformState State{*BestVF, BestUF, LI, 6957 DT, ILV.Builder, ILV.VectorLoopValueMap, 6958 &ILV, CallbackILV}; 6959 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 6960 State.TripCount = ILV.getOrCreateTripCount(nullptr); 6961 State.CanonicalIV = ILV.Induction; 6962 6963 //===------------------------------------------------===// 6964 // 6965 // Notice: any optimization or new instruction that go 6966 // into the code below should also be implemented in 6967 // the cost-model. 6968 // 6969 //===------------------------------------------------===// 6970 6971 // 2. Copy and widen instructions from the old loop into the new loop. 6972 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 6973 VPlans.front()->execute(&State); 6974 6975 // 3. Fix the vectorized code: take care of header phi's, live-outs, 6976 // predication, updating analyses. 6977 ILV.fixVectorizedLoop(); 6978 } 6979 6980 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 6981 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6982 BasicBlock *Latch = OrigLoop->getLoopLatch(); 6983 6984 // We create new control-flow for the vectorized loop, so the original 6985 // condition will be dead after vectorization if it's only used by the 6986 // branch. 6987 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 6988 if (Cmp && Cmp->hasOneUse()) 6989 DeadInstructions.insert(Cmp); 6990 6991 // We create new "steps" for induction variable updates to which the original 6992 // induction variables map. An original update instruction will be dead if 6993 // all its users except the induction variable are dead. 6994 for (auto &Induction : Legal->getInductionVars()) { 6995 PHINode *Ind = Induction.first; 6996 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 6997 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 6998 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 6999 })) 7000 DeadInstructions.insert(IndUpdate); 7001 7002 // We record as "Dead" also the type-casting instructions we had identified 7003 // during induction analysis. We don't need any handling for them in the 7004 // vectorized loop because we have proven that, under a proper runtime 7005 // test guarding the vectorized loop, the value of the phi, and the casted 7006 // value of the phi, are the same. The last instruction in this casting chain 7007 // will get its scalar/vector/widened def from the scalar/vector/widened def 7008 // of the respective phi node. Any other casts in the induction def-use chain 7009 // have no other uses outside the phi update chain, and will be ignored. 7010 InductionDescriptor &IndDes = Induction.second; 7011 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7012 DeadInstructions.insert(Casts.begin(), Casts.end()); 7013 } 7014 } 7015 7016 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7017 7018 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7019 7020 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7021 Instruction::BinaryOps BinOp) { 7022 // When unrolling and the VF is 1, we only need to add a simple scalar. 
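// For example (illustrative), for an integer induction with incoming value
// %val, start index StartIdx and step %step, the code below produces:
//   %induction = add %val, (StartIdx * %step)
// The floating-point case is analogous, using BinOp and fast-math flags.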
7023 Type *Ty = Val->getType(); 7024 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7025 7026 if (Ty->isFloatingPointTy()) { 7027 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7028 7029 // Floating point operations had to be 'fast' to enable the unrolling. 7030 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 7031 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 7032 } 7033 Constant *C = ConstantInt::get(Ty, StartIdx); 7034 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7035 } 7036 7037 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7038 SmallVector<Metadata *, 4> MDs; 7039 // Reserve first location for self reference to the LoopID metadata node. 7040 MDs.push_back(nullptr); 7041 bool IsUnrollMetadata = false; 7042 MDNode *LoopID = L->getLoopID(); 7043 if (LoopID) { 7044 // First find existing loop unrolling disable metadata. 7045 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7046 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7047 if (MD) { 7048 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7049 IsUnrollMetadata = 7050 S && S->getString().startswith("llvm.loop.unroll.disable"); 7051 } 7052 MDs.push_back(LoopID->getOperand(i)); 7053 } 7054 } 7055 7056 if (!IsUnrollMetadata) { 7057 // Add runtime unroll disable metadata. 7058 LLVMContext &Context = L->getHeader()->getContext(); 7059 SmallVector<Metadata *, 1> DisableOperands; 7060 DisableOperands.push_back( 7061 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7062 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7063 MDs.push_back(DisableNode); 7064 MDNode *NewLoopID = MDNode::get(Context, MDs); 7065 // Set operand 0 to refer to the loop id itself. 7066 NewLoopID->replaceOperandWith(0, NewLoopID); 7067 L->setLoopID(NewLoopID); 7068 } 7069 } 7070 7071 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7072 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7073 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 7074 bool PredicateAtRangeStart = Predicate(ElementCount::getFixed(Range.Start)); 7075 7076 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 7077 if (Predicate(ElementCount::getFixed(TmpVF)) != PredicateAtRangeStart) { 7078 Range.End = TmpVF; 7079 break; 7080 } 7081 7082 return PredicateAtRangeStart; 7083 } 7084 7085 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 7086 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 7087 /// of VF's starting at a given VF and extending it as much as possible. Each 7088 /// vectorization decision can potentially shorten this sub-range during 7089 /// buildVPlan(). 7090 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 7091 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 7092 VFRange SubRange = {VF, MaxVF + 1}; 7093 VPlans.push_back(buildVPlan(SubRange)); 7094 VF = SubRange.End; 7095 } 7096 } 7097 7098 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 7099 VPlanPtr &Plan) { 7100 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7101 7102 // Look for cached value. 7103 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 7104 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 7105 if (ECEntryIt != EdgeMaskCache.end()) 7106 return ECEntryIt->second; 7107 7108 VPValue *SrcMask = createBlockInMask(Src, Plan); 7109 7110 // The terminator has to be a branch inst! 
7111 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 7112 assert(BI && "Unexpected terminator found"); 7113 7114 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 7115 return EdgeMaskCache[Edge] = SrcMask; 7116 7117 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 7118 assert(EdgeMask && "No Edge Mask found for condition"); 7119 7120 if (BI->getSuccessor(0) != Dst) 7121 EdgeMask = Builder.createNot(EdgeMask); 7122 7123 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 7124 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 7125 7126 return EdgeMaskCache[Edge] = EdgeMask; 7127 } 7128 7129 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 7130 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 7131 7132 // Look for cached value. 7133 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 7134 if (BCEntryIt != BlockMaskCache.end()) 7135 return BCEntryIt->second; 7136 7137 // All-one mask is modelled as no-mask following the convention for masked 7138 // load/store/gather/scatter. Initialize BlockMask to no-mask. 7139 VPValue *BlockMask = nullptr; 7140 7141 if (OrigLoop->getHeader() == BB) { 7142 if (!CM.blockNeedsPredication(BB)) 7143 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 7144 7145 // Introduce the early-exit compare IV <= BTC to form header block mask. 7146 // This is used instead of IV < TC because TC may wrap, unlike BTC. 7147 // Start by constructing the desired canonical IV. 7148 VPValue *IV = nullptr; 7149 if (Legal->getPrimaryInduction()) 7150 IV = Plan->getVPValue(Legal->getPrimaryInduction()); 7151 else { 7152 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 7153 Builder.getInsertBlock()->appendRecipe(IVRecipe); 7154 IV = IVRecipe->getVPValue(); 7155 } 7156 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 7157 bool TailFolded = !CM.isScalarEpilogueAllowed(); 7158 7159 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 7160 // While ActiveLaneMask is a binary op that consumes the loop tripcount 7161 // as a second argument, we only pass the IV here and extract the 7162 // tripcount from the transform state where codegen of the VP instructions 7163 // happen. 7164 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 7165 } else { 7166 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 7167 } 7168 return BlockMaskCache[BB] = BlockMask; 7169 } 7170 7171 // This is the block mask. We OR all incoming edges. 7172 for (auto *Predecessor : predecessors(BB)) { 7173 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 7174 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 7175 return BlockMaskCache[BB] = EdgeMask; 7176 7177 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
7178 BlockMask = EdgeMask; 7179 continue; 7180 } 7181 7182 BlockMask = Builder.createOr(BlockMask, EdgeMask); 7183 } 7184 7185 return BlockMaskCache[BB] = BlockMask; 7186 } 7187 7188 VPWidenMemoryInstructionRecipe * 7189 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 7190 VPlanPtr &Plan) { 7191 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7192 "Must be called with either a load or store"); 7193 7194 auto willWiden = [&](ElementCount VF) -> bool { 7195 assert(!VF.Scalable && "unexpected scalable ElementCount"); 7196 if (VF.isScalar()) 7197 return false; 7198 LoopVectorizationCostModel::InstWidening Decision = 7199 CM.getWideningDecision(I, VF); 7200 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 7201 "CM decision should be taken at this point."); 7202 if (Decision == LoopVectorizationCostModel::CM_Interleave) 7203 return true; 7204 if (CM.isScalarAfterVectorization(I, VF) || 7205 CM.isProfitableToScalarize(I, VF)) 7206 return false; 7207 return Decision != LoopVectorizationCostModel::CM_Scalarize; 7208 }; 7209 7210 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7211 return nullptr; 7212 7213 VPValue *Mask = nullptr; 7214 if (Legal->isMaskRequired(I)) 7215 Mask = createBlockInMask(I->getParent(), Plan); 7216 7217 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 7218 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 7219 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 7220 7221 StoreInst *Store = cast<StoreInst>(I); 7222 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 7223 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 7224 } 7225 7226 VPWidenIntOrFpInductionRecipe * 7227 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { 7228 // Check if this is an integer or fp induction. If so, build the recipe that 7229 // produces its scalar and vector values. 7230 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 7231 if (II.getKind() == InductionDescriptor::IK_IntInduction || 7232 II.getKind() == InductionDescriptor::IK_FpInduction) 7233 return new VPWidenIntOrFpInductionRecipe(Phi); 7234 7235 return nullptr; 7236 } 7237 7238 VPWidenIntOrFpInductionRecipe * 7239 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, 7240 VFRange &Range) const { 7241 // Optimize the special case where the source is a constant integer 7242 // induction variable. Notice that we can only optimize the 'trunc' case 7243 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 7244 // (c) other casts depend on pointer size. 7245 7246 // Determine whether \p K is a truncation based on an induction variable that 7247 // can be optimized. 7248 auto isOptimizableIVTruncate = 7249 [&](Instruction *K) -> std::function<bool(ElementCount)> { 7250 return [=](ElementCount VF) -> bool { 7251 return CM.isOptimizableIVTruncate(K, VF); 7252 }; 7253 }; 7254 7255 if (LoopVectorizationPlanner::getDecisionAndClampRange( 7256 isOptimizableIVTruncate(I), Range)) 7257 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 7258 I); 7259 return nullptr; 7260 } 7261 7262 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 7263 // We know that all PHIs in non-header blocks are converted into selects, so 7264 // we don't have to worry about the insertion order and we can just use the 7265 // builder. At this point we generate the predication tree. 
There may be 7266 // duplications since this is a simple recursive scan, but future 7267 // optimizations will clean it up. 7268 7269 SmallVector<VPValue *, 2> Operands; 7270 unsigned NumIncoming = Phi->getNumIncomingValues(); 7271 for (unsigned In = 0; In < NumIncoming; In++) { 7272 VPValue *EdgeMask = 7273 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 7274 assert((EdgeMask || NumIncoming == 1) && 7275 "Multiple predecessors with one having a full mask"); 7276 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 7277 if (EdgeMask) 7278 Operands.push_back(EdgeMask); 7279 } 7280 return new VPBlendRecipe(Phi, Operands); 7281 } 7282 7283 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 7284 VPlan &Plan) const { 7285 7286 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7287 [this, CI](ElementCount VF) { 7288 return CM.isScalarWithPredication(CI, VF); 7289 }, 7290 Range); 7291 7292 if (IsPredicated) 7293 return nullptr; 7294 7295 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7296 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 7297 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 7298 return nullptr; 7299 7300 auto willWiden = [&](ElementCount VF) -> bool { 7301 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7302 // The following case may be scalarized depending on the VF. 7303 // The flag shows whether we use Intrinsic or a usual Call for vectorized 7304 // version of the instruction. 7305 // Is it beneficial to perform intrinsic call compared to lib call? 7306 bool NeedToScalarize = false; 7307 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 7308 bool UseVectorIntrinsic = 7309 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 7310 return UseVectorIntrinsic || !NeedToScalarize; 7311 }; 7312 7313 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7314 return nullptr; 7315 7316 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 7317 } 7318 7319 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 7320 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 7321 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 7322 // Instruction should be widened, unless it is scalar after vectorization, 7323 // scalarization is profitable or it is predicated. 
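// Note that this is a per-VF decision: getDecisionAndClampRange() may shrink
// Range so that, for example, an instruction which is only profitable to
// scalarize at the larger VFs is widened in this VPlan and replicated in the
// VPlan built for the remaining sub-range.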
7324 auto WillScalarize = [this, I](ElementCount VF) -> bool { 7325 return CM.isScalarAfterVectorization(I, VF) || 7326 CM.isProfitableToScalarize(I, VF) || 7327 CM.isScalarWithPredication(I, VF); 7328 }; 7329 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 7330 Range); 7331 } 7332 7333 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 7334 auto IsVectorizableOpcode = [](unsigned Opcode) { 7335 switch (Opcode) { 7336 case Instruction::Add: 7337 case Instruction::And: 7338 case Instruction::AShr: 7339 case Instruction::BitCast: 7340 case Instruction::FAdd: 7341 case Instruction::FCmp: 7342 case Instruction::FDiv: 7343 case Instruction::FMul: 7344 case Instruction::FNeg: 7345 case Instruction::FPExt: 7346 case Instruction::FPToSI: 7347 case Instruction::FPToUI: 7348 case Instruction::FPTrunc: 7349 case Instruction::FRem: 7350 case Instruction::FSub: 7351 case Instruction::ICmp: 7352 case Instruction::IntToPtr: 7353 case Instruction::LShr: 7354 case Instruction::Mul: 7355 case Instruction::Or: 7356 case Instruction::PtrToInt: 7357 case Instruction::SDiv: 7358 case Instruction::Select: 7359 case Instruction::SExt: 7360 case Instruction::Shl: 7361 case Instruction::SIToFP: 7362 case Instruction::SRem: 7363 case Instruction::Sub: 7364 case Instruction::Trunc: 7365 case Instruction::UDiv: 7366 case Instruction::UIToFP: 7367 case Instruction::URem: 7368 case Instruction::Xor: 7369 case Instruction::ZExt: 7370 return true; 7371 } 7372 return false; 7373 }; 7374 7375 if (!IsVectorizableOpcode(I->getOpcode())) 7376 return nullptr; 7377 7378 // Success: widen this instruction. 7379 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 7380 } 7381 7382 VPBasicBlock *VPRecipeBuilder::handleReplication( 7383 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 7384 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 7385 VPlanPtr &Plan) { 7386 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 7387 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 7388 Range); 7389 7390 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7391 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 7392 Range); 7393 7394 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 7395 IsUniform, IsPredicated); 7396 setRecipe(I, Recipe); 7397 7398 // Find if I uses a predicated instruction. If so, it will use its scalar 7399 // value. Avoid hoisting the insert-element which packs the scalar value into 7400 // a vector value, as that happens iff all users use the vector value. 7401 for (auto &Op : I->operands()) 7402 if (auto *PredInst = dyn_cast<Instruction>(Op)) 7403 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 7404 PredInst2Recipe[PredInst]->setAlsoPack(false); 7405 7406 // Finalize the recipe for Instr, first if it is not predicated. 7407 if (!IsPredicated) { 7408 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 7409 VPBB->appendRecipe(Recipe); 7410 return VPBB; 7411 } 7412 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7413 assert(VPBB->getSuccessors().empty() && 7414 "VPBB has successors when handling predicated replication."); 7415 // Record predicated instructions for above packing optimizations. 
7416 PredInst2Recipe[I] = Recipe; 7417 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7418 VPBlockUtils::insertBlockAfter(Region, VPBB); 7419 auto *RegSucc = new VPBasicBlock(); 7420 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7421 return RegSucc; 7422 } 7423 7424 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7425 VPRecipeBase *PredRecipe, 7426 VPlanPtr &Plan) { 7427 // Instructions marked for predication are replicated and placed under an 7428 // if-then construct to prevent side-effects. 7429 7430 // Generate recipes to compute the block mask for this region. 7431 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 7432 7433 // Build the triangular if-then region. 7434 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 7435 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 7436 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 7437 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 7438 auto *PHIRecipe = 7439 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 7440 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 7441 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 7442 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 7443 7444 // Note: first set Entry as region entry and then connect successors starting 7445 // from it in order, to propagate the "parent" of each VPBasicBlock. 7446 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 7447 VPBlockUtils::connectBlocks(Pred, Exit); 7448 7449 return Region; 7450 } 7451 7452 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 7453 VFRange &Range, 7454 VPlanPtr &Plan) { 7455 // First, check for specific widening recipes that deal with calls, memory 7456 // operations, inductions and Phi nodes. 7457 if (auto *CI = dyn_cast<CallInst>(Instr)) 7458 return tryToWidenCall(CI, Range, *Plan); 7459 7460 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 7461 return tryToWidenMemory(Instr, Range, Plan); 7462 7463 VPRecipeBase *Recipe; 7464 if (auto Phi = dyn_cast<PHINode>(Instr)) { 7465 if (Phi->getParent() != OrigLoop->getHeader()) 7466 return tryToBlend(Phi, Plan); 7467 if ((Recipe = tryToOptimizeInductionPHI(Phi))) 7468 return Recipe; 7469 return new VPWidenPHIRecipe(Phi); 7470 } 7471 7472 if (isa<TruncInst>(Instr) && 7473 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range))) 7474 return Recipe; 7475 7476 if (!shouldWiden(Instr, Range)) 7477 return nullptr; 7478 7479 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 7480 return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()), 7481 OrigLoop); 7482 7483 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 7484 bool InvariantCond = 7485 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 7486 return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()), 7487 InvariantCond); 7488 } 7489 7490 return tryToWiden(Instr, *Plan); 7491 } 7492 7493 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 7494 unsigned MaxVF) { 7495 assert(OrigLoop->empty() && "Inner loop expected."); 7496 7497 // Collect conditions feeding internal conditional branches; they need to be 7498 // represented in VPlan for it to model masking. 
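// For example, for a loop body containing "if (a[i] > 42) ...", the compare
// feeding the conditional branch must get a VPValue so that createEdgeMask()
// can look it up when building edge and block masks.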
SmallPtrSet<Value *, 1> NeedDef;
7500
7501 auto *Latch = OrigLoop->getLoopLatch();
7502 for (BasicBlock *BB : OrigLoop->blocks()) {
7503 if (BB == Latch)
7504 continue;
7505 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7506 if (Branch && Branch->isConditional())
7507 NeedDef.insert(Branch->getCondition());
7508 }
7509
7510 // If the tail is to be folded by masking, the primary induction variable, if
7511 // it exists, needs to be represented in VPlan for it to model early-exit masking.
7512 // Also, both the Phi and the live-out instruction of each reduction are
7513 // required in order to introduce a select between them in VPlan.
7514 if (CM.foldTailByMasking()) {
7515 if (Legal->getPrimaryInduction())
7516 NeedDef.insert(Legal->getPrimaryInduction());
7517 for (auto &Reduction : Legal->getReductionVars()) {
7518 NeedDef.insert(Reduction.first);
7519 NeedDef.insert(Reduction.second.getLoopExitInstr());
7520 }
7521 }
7522
7523 // Collect instructions from the original loop that will become trivially dead
7524 // in the vectorized loop. We don't need to vectorize these instructions. For
7525 // example, original induction update instructions can become dead because we
7526 // separately emit induction "steps" when generating code for the new loop.
7527 // Similarly, we create a new latch condition when setting up the structure
7528 // of the new loop, so the old one can become dead.
7529 SmallPtrSet<Instruction *, 4> DeadInstructions;
7530 collectTriviallyDeadInstructions(DeadInstructions);
7531
7532 // Add assume instructions we need to drop to DeadInstructions, to prevent
7533 // them from being added to the VPlan.
7534 // TODO: We only need to drop assumes in blocks that get flattened. If the
7535 // control flow is preserved, we should keep them.
7536 auto &ConditionalAssumes = Legal->getConditionalAssumes();
7537 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7538
7539 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7540 // Dead instructions do not need sinking. Remove them from SinkAfter.
7541 for (Instruction *I : DeadInstructions)
7542 SinkAfter.erase(I);
7543
7544 for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7545 VFRange SubRange = {VF, MaxVF + 1};
7546 VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7547 DeadInstructions, SinkAfter));
7548 VF = SubRange.End;
7549 }
7550 }
7551
7552 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7553 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7554 SmallPtrSetImpl<Instruction *> &DeadInstructions,
7555 const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7556
7557 // Hold a mapping from predicated instructions to their recipes, in order to
7558 // fix their AlsoPack behavior if a user is determined to replicate and use a
7559 // scalar instead of a vector value.
7560 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7561
7562 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7563
7564 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7565
7566 // ---------------------------------------------------------------------------
7567 // Pre-construction: record ingredients whose recipes we'll need to further
7568 // process after constructing the initial VPlan.
7569 // ---------------------------------------------------------------------------
7570
7571 // Mark instructions we'll need to sink later and their targets as
7572 // ingredients whose recipe we'll need to record.
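// Recording both sides of each sink-after pair lets us look their recipes up
// again after the initial VPlan is built and reorder them via moveAfter()
// (see "Apply Sink-After legal constraints" below).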
7573 for (auto &Entry : SinkAfter) {
7574 RecipeBuilder.recordRecipeOf(Entry.first);
7575 RecipeBuilder.recordRecipeOf(Entry.second);
7576 }
7577 for (auto &Reduction : CM.getInLoopReductionChains()) {
7578 PHINode *Phi = Reduction.first;
7579 RecurrenceDescriptor::RecurrenceKind Kind =
7580 Legal->getReductionVars()[Phi].getRecurrenceKind();
7581 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7582
7583 RecipeBuilder.recordRecipeOf(Phi);
7584 for (auto &R : ReductionOperations) {
7585 RecipeBuilder.recordRecipeOf(R);
7586 // For min/max reductions, where we have a pair of icmp/select, we also
7587 // need to record the ICmp recipe, so it can be removed later.
7588 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7589 Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7590 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
7591 }
7592 }
7593 }
7594
7595 // For each interleave group which is relevant for this (possibly trimmed)
7596 // Range, add it to the set of groups to be later applied to the VPlan and add
7597 // placeholders for its members' Recipes which we'll be replacing with a
7598 // single VPInterleaveRecipe.
7599 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7600 auto applyIG = [IG, this](ElementCount VF) -> bool {
7601 return (VF.isVector() && // Query is illegal for VF == 1
7602 CM.getWideningDecision(IG->getInsertPos(), VF) ==
7603 LoopVectorizationCostModel::CM_Interleave);
7604 };
7605 if (!getDecisionAndClampRange(applyIG, Range))
7606 continue;
7607 InterleaveGroups.insert(IG);
7608 for (unsigned i = 0; i < IG->getFactor(); i++)
7609 if (Instruction *Member = IG->getMember(i))
7610 RecipeBuilder.recordRecipeOf(Member);
7611 }
7612
7613 // ---------------------------------------------------------------------------
7614 // Build initial VPlan: Scan the body of the loop in a topological order to
7615 // visit each basic block after having visited its predecessor basic blocks.
7616 // ---------------------------------------------------------------------------
7617
7618 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7619 auto Plan = std::make_unique<VPlan>();
7620 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7621 Plan->setEntry(VPBB);
7622
7623 // Represent values that will have defs inside VPlan.
7624 for (Value *V : NeedDef)
7625 Plan->addVPValue(V);
7626
7627 // Scan the body of the loop in a topological order to visit each basic block
7628 // after having visited its predecessor basic blocks.
7629 LoopBlocksDFS DFS(OrigLoop);
7630 DFS.perform(LI);
7631
7632 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7633 // Relevant instructions from basic block BB will be grouped into VPRecipe
7634 // ingredients and fill a new VPBasicBlock.
7635 unsigned VPBBsForBB = 0;
7636 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7637 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7638 VPBB = FirstVPBBForBB;
7639 Builder.setInsertPoint(VPBB);
7640
7641 // Introduce each ingredient into VPlan.
7642 // TODO: Model and preserve debug intrinsics in VPlan.
7643 for (Instruction &I : BB->instructionsWithoutDebug()) {
7644 Instruction *Instr = &I;
7645
7646 // First filter out irrelevant instructions, to ensure no recipes are
7647 // built for them.
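// Branches need no recipe of their own, since control flow is modelled via
// the block and edge masks built above; dead instructions simply have no
// counterpart in the vectorized loop.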
7648 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 7649 continue; 7650 7651 if (auto Recipe = 7652 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { 7653 RecipeBuilder.setRecipe(Instr, Recipe); 7654 VPBB->appendRecipe(Recipe); 7655 continue; 7656 } 7657 7658 // Otherwise, if all widening options failed, Instruction is to be 7659 // replicated. This may create a successor for VPBB. 7660 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7661 Instr, Range, VPBB, PredInst2Recipe, Plan); 7662 if (NextVPBB != VPBB) { 7663 VPBB = NextVPBB; 7664 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 7665 : ""); 7666 } 7667 } 7668 } 7669 7670 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7671 // may also be empty, such as the last one VPBB, reflecting original 7672 // basic-blocks with no recipes. 7673 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7674 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7675 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7676 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7677 delete PreEntry; 7678 7679 // --------------------------------------------------------------------------- 7680 // Transform initial VPlan: Apply previously taken decisions, in order, to 7681 // bring the VPlan to its final state. 7682 // --------------------------------------------------------------------------- 7683 7684 // Apply Sink-After legal constraints. 7685 for (auto &Entry : SinkAfter) { 7686 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7687 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7688 Sink->moveAfter(Target); 7689 } 7690 7691 // Interleave memory: for each Interleave Group we marked earlier as relevant 7692 // for this VPlan, replace the Recipes widening its memory instructions with a 7693 // single VPInterleaveRecipe at its insertion point. 7694 for (auto IG : InterleaveGroups) { 7695 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7696 RecipeBuilder.getRecipe(IG->getInsertPos())); 7697 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) 7698 ->insertBefore(Recipe); 7699 7700 for (unsigned i = 0; i < IG->getFactor(); ++i) 7701 if (Instruction *Member = IG->getMember(i)) { 7702 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7703 } 7704 } 7705 7706 // Adjust the recipes for any inloop reductions. 7707 if (Range.Start > 1) 7708 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 7709 7710 // Finally, if tail is folded by masking, introduce selects between the phi 7711 // and the live-out instruction of each reduction, at the end of the latch. 
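// The select condition is the header block-in mask, i.e. (illustrative)
//   select(%header.mask, %red.liveout, %red.phi)
// so lanes masked off by tail folding keep the value carried by the phi.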
7712 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
7713 Builder.setInsertPoint(VPBB);
7714 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7715 for (auto &Reduction : Legal->getReductionVars()) {
7716 assert(!CM.isInLoopReduction(Reduction.first) &&
7717 "Didn't expect inloop tail folded reduction yet!");
7718 VPValue *Phi = Plan->getVPValue(Reduction.first);
7719 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7720 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7721 }
7722 }
7723
7724 std::string PlanName;
7725 raw_string_ostream RSO(PlanName);
7726 ElementCount VF = ElementCount::getFixed(Range.Start);
7727 Plan->addVF(VF);
7728 RSO << "Initial VPlan for VF={" << VF;
7729 for (VF.Min *= 2; VF.Min < Range.End; VF.Min *= 2) {
7730 Plan->addVF(VF);
7731 RSO << "," << VF;
7732 }
7733 RSO << "},UF>=1";
7734 RSO.flush();
7735 Plan->setName(PlanName);
7736
7737 return Plan;
7738 }
7739
7740 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7741 // Outer loop handling: They may require CFG and instruction level
7742 // transformations before even evaluating whether vectorization is profitable.
7743 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7744 // the vectorization pipeline.
7745 assert(!OrigLoop->empty());
7746 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7747
7748 // Create new empty VPlan
7749 auto Plan = std::make_unique<VPlan>();
7750
7751 // Build hierarchical CFG
7752 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7753 HCFGBuilder.buildHierarchicalCFG();
7754
7755 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7756 Plan->addVF(ElementCount::getFixed(VF));
7757
7758 if (EnableVPlanPredication) {
7759 VPlanPredicator VPP(*Plan);
7760 VPP.predicate();
7761
7762 // Avoid running transformation to recipes until masked code generation in
7763 // VPlan-native path is in place.
7764 return Plan;
7765 }
7766
7767 SmallPtrSet<Instruction *, 1> DeadInstructions;
7768 VPlanTransforms::VPInstructionsToVPRecipes(
7769 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7770 return Plan;
7771 }
7772
7773 // Adjust the recipes for any inloop reductions. The chain of instructions
7774 // leading from the loop exit instr to the phi needs to be converted to
7775 // reductions, with one operand being vector and the other being the scalar
7776 // reduction chain.
7777 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
7778 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
7779 for (auto &Reduction : CM.getInLoopReductionChains()) {
7780 PHINode *Phi = Reduction.first;
7781 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
7782 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7783
7784 // ReductionOperations are ordered top-down from the phi's use to the
7785 // LoopExitValue. We keep track of the previous item (the Chain) to tell
7786 // which of the two operands will remain scalar and which will be reduced.
7787 // For minmax the chain will be the select instructions.
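// For example (sketch), for an integer sum reduction
//   %sum = phi [ 0, %preheader ], [ %sum.next, %latch ]
//   %sum.next = add %sum, %val
// ReductionOperations holds just the add; the phi (the initial Chain) stays
// as the scalar chain operand, and %val becomes the vector operand of the
// VPReductionRecipe created below.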
7788 Instruction *Chain = Phi; 7789 for (Instruction *R : ReductionOperations) { 7790 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 7791 RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind(); 7792 7793 VPValue *ChainOp = Plan->getVPValue(Chain); 7794 unsigned FirstOpId; 7795 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7796 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7797 assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSelectSC && 7798 "Expected to replace a VPWidenSelectSC"); 7799 FirstOpId = 1; 7800 } else { 7801 assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC && 7802 "Expected to replace a VPWidenSC"); 7803 FirstOpId = 0; 7804 } 7805 unsigned VecOpId = 7806 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 7807 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 7808 7809 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 7810 &RdxDesc, R, ChainOp, VecOp, Legal->hasFunNoNaNAttr(), TTI); 7811 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 7812 WidenRecipe->eraseFromParent(); 7813 7814 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7815 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7816 VPRecipeBase *CompareRecipe = 7817 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 7818 assert(CompareRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC && 7819 "Expected to replace a VPWidenSC"); 7820 CompareRecipe->eraseFromParent(); 7821 } 7822 Chain = R; 7823 } 7824 } 7825 } 7826 7827 Value* LoopVectorizationPlanner::VPCallbackILV:: 7828 getOrCreateVectorValues(Value *V, unsigned Part) { 7829 return ILV.getOrCreateVectorValue(V, Part); 7830 } 7831 7832 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 7833 Value *V, const VPIteration &Instance) { 7834 return ILV.getOrCreateScalarValue(V, Instance); 7835 } 7836 7837 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 7838 VPSlotTracker &SlotTracker) const { 7839 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7840 IG->getInsertPos()->printAsOperand(O, false); 7841 O << ", "; 7842 getAddr()->printAsOperand(O, SlotTracker); 7843 VPValue *Mask = getMask(); 7844 if (Mask) { 7845 O << ", "; 7846 Mask->printAsOperand(O, SlotTracker); 7847 } 7848 for (unsigned i = 0; i < IG->getFactor(); ++i) 7849 if (Instruction *I = IG->getMember(i)) 7850 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 7851 } 7852 7853 void VPWidenCallRecipe::execute(VPTransformState &State) { 7854 State.ILV->widenCallInstruction(Ingredient, User, State); 7855 } 7856 7857 void VPWidenSelectRecipe::execute(VPTransformState &State) { 7858 State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State); 7859 } 7860 7861 void VPWidenRecipe::execute(VPTransformState &State) { 7862 State.ILV->widenInstruction(Ingredient, User, State); 7863 } 7864 7865 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7866 State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant, 7867 IsIndexLoopInvariant, State); 7868 } 7869 7870 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7871 assert(!State.Instance && "Int or FP induction being replicated."); 7872 State.ILV->widenIntOrFpInduction(IV, Trunc); 7873 } 7874 7875 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7876 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7877 } 7878 7879 void VPBlendRecipe::execute(VPTransformState &State) { 7880 
State.ILV->setDebugLocFromInst(State.Builder, Phi); 7881 // We know that all PHIs in non-header blocks are converted into 7882 // selects, so we don't have to worry about the insertion order and we 7883 // can just use the builder. 7884 // At this point we generate the predication tree. There may be 7885 // duplications since this is a simple recursive scan, but future 7886 // optimizations will clean it up. 7887 7888 unsigned NumIncoming = getNumIncomingValues(); 7889 7890 // Generate a sequence of selects of the form: 7891 // SELECT(Mask3, In3, 7892 // SELECT(Mask2, In2, 7893 // SELECT(Mask1, In1, 7894 // In0))) 7895 // Note that Mask0 is never used: lanes for which no path reaches this phi and 7896 // are essentially undef are taken from In0. 7897 InnerLoopVectorizer::VectorParts Entry(State.UF); 7898 for (unsigned In = 0; In < NumIncoming; ++In) { 7899 for (unsigned Part = 0; Part < State.UF; ++Part) { 7900 // We might have single edge PHIs (blocks) - use an identity 7901 // 'select' for the first PHI operand. 7902 Value *In0 = State.get(getIncomingValue(In), Part); 7903 if (In == 0) 7904 Entry[Part] = In0; // Initialize with the first incoming value. 7905 else { 7906 // Select between the current value and the previous incoming edge 7907 // based on the incoming mask. 7908 Value *Cond = State.get(getMask(In), Part); 7909 Entry[Part] = 7910 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 7911 } 7912 } 7913 } 7914 for (unsigned Part = 0; Part < State.UF; ++Part) 7915 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 7916 } 7917 7918 void VPInterleaveRecipe::execute(VPTransformState &State) { 7919 assert(!State.Instance && "Interleave group being replicated."); 7920 State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask()); 7921 } 7922 7923 void VPReductionRecipe::execute(VPTransformState &State) { 7924 assert(!State.Instance && "Reduction being replicated."); 7925 for (unsigned Part = 0; Part < State.UF; ++Part) { 7926 unsigned Kind = RdxDesc->getRecurrenceKind(); 7927 Value *NewVecOp = State.get(VecOp, Part); 7928 Value *NewRed = 7929 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN); 7930 Value *PrevInChain = State.get(ChainOp, Part); 7931 Value *NextInChain; 7932 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7933 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7934 NextInChain = 7935 createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(), 7936 NewRed, PrevInChain); 7937 } else { 7938 NextInChain = State.Builder.CreateBinOp( 7939 (Instruction::BinaryOps)I->getOpcode(), NewRed, PrevInChain); 7940 } 7941 State.ValueMap.setVectorValue(I, Part, NextInChain); 7942 } 7943 } 7944 7945 void VPReplicateRecipe::execute(VPTransformState &State) { 7946 if (State.Instance) { // Generate a single instance. 7947 State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance, 7948 IsPredicated, State); 7949 // Insert scalar instance packing it into a vector. 7950 if (AlsoPack && State.VF.isVector()) { 7951 // If we're constructing lane 0, initialize to start from undef. 
7952 if (State.Instance->Lane == 0) {
7953 assert(!State.VF.Scalable && "VF is assumed to be non scalable.");
7954 Value *Undef =
7955 UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7956 State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7957 }
7958 State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7959 }
7960 return;
7961 }
7962
7963 // Generate scalar instances for all VF lanes of all UF parts, unless the
7964 // instruction is uniform, in which case generate only the first lane for each
7965 // of the UF parts.
7966 unsigned EndLane = IsUniform ? 1 : State.VF.Min;
7967 for (unsigned Part = 0; Part < State.UF; ++Part)
7968 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7969 State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane},
7970 IsPredicated, State);
7971 }
7972
7973 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7974 assert(State.Instance && "Branch on Mask works only on single instance.");
7975
7976 unsigned Part = State.Instance->Part;
7977 unsigned Lane = State.Instance->Lane;
7978
7979 Value *ConditionBit = nullptr;
7980 VPValue *BlockInMask = getMask();
7981 if (BlockInMask) {
7982 ConditionBit = State.get(BlockInMask, Part);
7983 if (ConditionBit->getType()->isVectorTy())
7984 ConditionBit = State.Builder.CreateExtractElement(
7985 ConditionBit, State.Builder.getInt32(Lane));
7986 } else // Block in mask is all-one.
7987 ConditionBit = State.Builder.getTrue();
7988
7989 // Replace the temporary unreachable terminator with a new conditional branch,
7990 // whose two destinations will be set later when they are created.
7991 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7992 assert(isa<UnreachableInst>(CurrentTerminator) &&
7993 "Expected to replace unreachable terminator with conditional branch.");
7994 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7995 CondBr->setSuccessor(0, nullptr);
7996 ReplaceInstWithInst(CurrentTerminator, CondBr);
7997 }
7998
7999 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8000 assert(State.Instance && "Predicated instruction PHI works per instance.");
8001 Instruction *ScalarPredInst = cast<Instruction>(
8002 State.ValueMap.getScalarValue(PredInst, *State.Instance));
8003 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8004 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8005 assert(PredicatingBB && "Predicated block has no single predecessor.");
8006
8007 // By current pack/unpack logic we need to generate only a single phi node: if
8008 // a vector value for the predicated instruction exists at this point it means
8009 // the instruction has vector users only, and a phi for the vector value is
8010 // needed. In this case the recipe of the predicated instruction is marked to
8011 // also do that packing, thereby "hoisting" the insert-element sequence.
8012 // Otherwise, a phi node for the scalar value is needed.
8013 unsigned Part = State.Instance->Part;
8014 if (State.ValueMap.hasVectorValue(PredInst, Part)) {
8015 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
8016 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
8017 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
8018 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8019 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
8020 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8021 } else { 8022 Type *PredInstType = PredInst->getType(); 8023 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 8024 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 8025 Phi->addIncoming(ScalarPredInst, PredicatedBB); 8026 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 8027 } 8028 } 8029 8030 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 8031 VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr; 8032 State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue, 8033 getMask()); 8034 } 8035 8036 // Determine how to lower the scalar epilogue, which depends on 1) optimising 8037 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 8038 // predication, and 4) a TTI hook that analyses whether the loop is suitable 8039 // for predication. 8040 static ScalarEpilogueLowering getScalarEpilogueLowering( 8041 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 8042 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 8043 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 8044 LoopVectorizationLegality &LVL) { 8045 // 1) OptSize takes precedence over all other options, i.e. if this is set, 8046 // don't look at hints or options, and don't request a scalar epilogue. 8047 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 8048 // LoopAccessInfo (due to code dependency and not being able to reliably get 8049 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 8050 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 8051 // versioning when the vectorization is forced, unlike hasOptSize. So revert 8052 // back to the old way and vectorize with versioning when forced. See D81345.) 8053 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 8054 PGSOQueryType::IRPass) && 8055 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 8056 return CM_ScalarEpilogueNotAllowedOptSize; 8057 8058 bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && 8059 !PreferPredicateOverEpilog; 8060 8061 // 2) Next, if disabling predication is requested on the command line, honour 8062 // this and request a scalar epilogue. 8063 if (PredicateOptDisabled) 8064 return CM_ScalarEpilogueAllowed; 8065 8066 // 3) and 4) look if enabling predication is requested on the command line, 8067 // with a loop hint, or if the TTI hook indicates this is profitable, request 8068 // predication . 8069 if (PreferPredicateOverEpilog || 8070 Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || 8071 (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 8072 LVL.getLAI()) && 8073 Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) 8074 return CM_ScalarEpilogueNotNeededUsePredicate; 8075 8076 return CM_ScalarEpilogueAllowed; 8077 } 8078 8079 // Process the loop in the VPlan-native vectorization path. This path builds 8080 // VPlan upfront in the vectorization pipeline, which allows to apply 8081 // VPlan-to-VPlan transformations from the very beginning without modifying the 8082 // input LLVM IR. 
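// Note that this path currently bails out before generating any vector code
// when VPlan build stress testing or VPlan predication is enabled, or when
// planning decides that vectorization should be disabled.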
8083 static bool processLoopInVPlanNativePath( 8084 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 8085 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 8086 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 8087 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 8088 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 8089 8090 if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) { 8091 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 8092 return false; 8093 } 8094 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 8095 Function *F = L->getHeader()->getParent(); 8096 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 8097 8098 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 8099 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 8100 8101 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 8102 &Hints, IAI); 8103 // Use the planner for outer loop vectorization. 8104 // TODO: CM is not used at this point inside the planner. Turn CM into an 8105 // optional argument if we don't need it in the future. 8106 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 8107 8108 // Get user vectorization factor. 8109 const unsigned UserVF = Hints.getWidth(); 8110 8111 // Plan how to best vectorize, return the best VF and its cost. 8112 const VectorizationFactor VF = 8113 LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF)); 8114 8115 // If we are stress testing VPlan builds, do not attempt to generate vector 8116 // code. Masked vector code generation support will follow soon. 8117 // Also, do not attempt to vectorize if no vector code will be produced. 8118 if (VPlanBuildStressTest || EnableVPlanPredication || 8119 VectorizationFactor::Disabled() == VF) 8120 return false; 8121 8122 LVP.setBestPlan(VF.Width, 1); 8123 8124 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 8125 &CM, BFI, PSI); 8126 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 8127 << L->getHeader()->getParent()->getName() << "\"\n"); 8128 LVP.executePlan(LB, DT); 8129 8130 // Mark the loop as already vectorized to avoid vectorizing again. 8131 Hints.setAlreadyVectorized(); 8132 8133 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 8134 return true; 8135 } 8136 8137 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 8138 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 8139 !EnableLoopInterleaving), 8140 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 8141 !EnableLoopVectorization) {} 8142 8143 bool LoopVectorizePass::processLoop(Loop *L) { 8144 assert((EnableVPlanNativePath || L->empty()) && 8145 "VPlan-native path is not enabled. Only process inner loops."); 8146 8147 #ifndef NDEBUG 8148 const std::string DebugLocStr = getDebugLocString(L); 8149 #endif /* NDEBUG */ 8150 8151 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 8152 << L->getHeader()->getParent()->getName() << "\" from " 8153 << DebugLocStr << "\n"); 8154 8155 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 8156 8157 LLVM_DEBUG( 8158 dbgs() << "LV: Loop hints:" 8159 << " force=" 8160 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 8161 ? "disabled" 8162 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 8163 ? 
"enabled" 8164 : "?")) 8165 << " width=" << Hints.getWidth() 8166 << " unroll=" << Hints.getInterleave() << "\n"); 8167 8168 // Function containing loop 8169 Function *F = L->getHeader()->getParent(); 8170 8171 // Looking at the diagnostic output is the only way to determine if a loop 8172 // was vectorized (other than looking at the IR or machine code), so it 8173 // is important to generate an optimization remark for each loop. Most of 8174 // these messages are generated as OptimizationRemarkAnalysis. Remarks 8175 // generated as OptimizationRemark and OptimizationRemarkMissed are 8176 // less verbose reporting vectorized loops and unvectorized loops that may 8177 // benefit from vectorization, respectively. 8178 8179 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 8180 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 8181 return false; 8182 } 8183 8184 PredicatedScalarEvolution PSE(*SE, *L); 8185 8186 // Check if it is legal to vectorize the loop. 8187 LoopVectorizationRequirements Requirements(*ORE); 8188 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 8189 &Requirements, &Hints, DB, AC, BFI, PSI); 8190 if (!LVL.canVectorize(EnableVPlanNativePath)) { 8191 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 8192 Hints.emitRemarkWithHints(); 8193 return false; 8194 } 8195 8196 // Check the function attributes and profiles to find out if this function 8197 // should be optimized for size. 8198 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 8199 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 8200 8201 // Entrance to the VPlan-native vectorization path. Outer loops are processed 8202 // here. They may require CFG and instruction level transformations before 8203 // even evaluating whether vectorization is profitable. Since we cannot modify 8204 // the incoming IR, we need to build VPlan upfront in the vectorization 8205 // pipeline. 8206 if (!L->empty()) 8207 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 8208 ORE, BFI, PSI, Hints); 8209 8210 assert(L->empty() && "Inner loop expected."); 8211 8212 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 8213 // count by optimizing for size, to minimize overheads. 8214 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 8215 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 8216 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 8217 << "This loop is worth vectorizing only if no scalar " 8218 << "iteration overheads are incurred."); 8219 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 8220 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 8221 else { 8222 LLVM_DEBUG(dbgs() << "\n"); 8223 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 8224 } 8225 } 8226 8227 // Check the function attributes to see if implicit floats are allowed. 8228 // FIXME: This check doesn't seem possibly correct -- what if the loop is 8229 // an integer loop and the vector instructions selected are purely integer 8230 // vector instructions? 8231 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 8232 reportVectorizationFailure( 8233 "Can't vectorize when the NoImplicitFloat attribute is used", 8234 "loop not vectorized due to NoImplicitFloat attribute", 8235 "NoImplicitFloat", ORE, L); 8236 Hints.emitRemarkWithHints(); 8237 return false; 8238 } 8239 8240 // Check if the target supports potentially unsafe FP vectorization. 
  // Wrap SCEV in a predicated interface so that the legality analysis below
  // can record SCEV predicates that must later be checked at runtime.
  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements(*ORE);
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->empty())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints);

  assert(L->empty() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem possibly correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get user vectorization factor and interleave count.
  unsigned UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF =
      LVP.plan(ElementCount::getFixed(UserVF), UserIC);
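  // Start from a disabled vectorization factor and an interleave count of 1;
  // a successful planning result overrides these defaults below.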
  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width == 1) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }
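  // Commit the chosen VF and interleave count to the planner before code
  // generation.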
  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not legal to vectorize the loop, then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
                               BFI, PSI);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is *legal* to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM, BFI, PSI);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling a scalar loop when there are
    // no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  // Cache the analysis results in member pointers; processLoop relies on them.
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Report whether anything was changed and whether the CFG was modified.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;
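  // LoopAccessInfo is a per-loop analysis; provide a callback that fetches it
  // lazily from the inner loop analysis manager as each loop is processed.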
  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}