//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
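//
// Illustrative sketch (not from any of the papers above): given a simple loop
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// and a vectorization factor of 4, the vectorizer conceptually emits a loop
// whose induction variable steps by 4 and whose body loads b[i..i+3] and
// c[i..i+3], adds them with one vector instruction, and stores the result to
// a[i..i+3]; a scalar epilogue (or a predicated tail) handles the remaining
// n % 4 iterations.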
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired and that predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
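/// For example (an illustrative sketch, not tied to a specific target): a
/// stride-3 group over a struct {a, b, c} that only uses members a and c has a
/// "gap" at b; loading the group with one wide load reads the b lanes too, so
/// those lanes must be masked away (or proven safe to access) before the group
/// can be vectorized.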
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the targets preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
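/// For example (illustrative): i1 has a type size of 1 bit but an alloc size
/// of 8 bits, so it is irregular, while i32 (32 bits for both) is regular.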
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = FixedVectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
                unsigned VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                VPTransformState &State, VPValue *Addr,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr, VPValue *StoredValue,
                                  VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  void createInductionResumeValues(Loop *L, Value *VectorTripCount);

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM, BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis
createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
                 Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
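  /// A uniform value is one that is the same for all vector lanes; for
  /// instance (illustrative example), the pointer operand of a consecutive
  /// load only needs its lane-0 value, so its address computation can stay
  /// scalar. See collectLoopUniforms for the actual analysis.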
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
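  /// (Illustrative example: a scatter stores to non-consecutive addresses such
  /// as A[B[i]] = X[i], using a vector of pointers plus a mask of the active
  /// lanes.)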
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
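  /// For example (illustrative), with a trip count of 10 and VF = 4, tail
  /// folding runs three masked vector iterations (the last with only two
  /// active lanes) instead of two vector iterations plus a scalar remainder.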
1350 bool foldTailByMasking() const { return FoldTailByMasking; } 1351 1352 bool blockNeedsPredication(BasicBlock *BB) { 1353 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1354 } 1355 1356 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1357 /// nodes to the chain of instructions representing the reductions. Uses a 1358 /// MapVector to ensure deterministic iteration order. 1359 using ReductionChainMap = 1360 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1361 1362 /// Return the chain of instructions representing an inloop reduction. 1363 const ReductionChainMap &getInLoopReductionChains() const { 1364 return InLoopReductionChains; 1365 } 1366 1367 /// Returns true if the Phi is part of an inloop reduction. 1368 bool isInLoopReduction(PHINode *Phi) const { 1369 return InLoopReductionChains.count(Phi); 1370 } 1371 1372 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1373 /// with factor VF. Return the cost of the instruction, including 1374 /// scalarization overhead if it's needed. 1375 unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF); 1376 1377 /// Estimate cost of a call instruction CI if it were vectorized with factor 1378 /// VF. Return the cost of the instruction, including scalarization overhead 1379 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1380 /// scalarized - 1381 /// i.e. either vector version isn't available, or is too expensive. 1382 unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize); 1383 1384 /// Invalidates decisions already taken by the cost model. 1385 void invalidateCostModelingDecisions() { 1386 WideningDecisions.clear(); 1387 Uniforms.clear(); 1388 Scalars.clear(); 1389 } 1390 1391 private: 1392 unsigned NumPredStores = 0; 1393 1394 /// \return An upper bound for the vectorization factor, a power-of-2 larger 1395 /// than zero. One is returned if vectorization should best be avoided due 1396 /// to cost. 1397 unsigned computeFeasibleMaxVF(unsigned ConstTripCount); 1398 1399 /// The vectorization cost is a combination of the cost itself and a boolean 1400 /// indicating whether any of the contributing operations will actually 1401 /// operate on 1402 /// vector values after type legalization in the backend. If this latter value 1403 /// is 1404 /// false, then all operations will be scalarized (i.e. no vectorization has 1405 /// actually taken place). 1406 using VectorizationCostTy = std::pair<unsigned, bool>; 1407 1408 /// Returns the expected execution cost. The unit of the cost does 1409 /// not matter because we use the 'cost' units to compare different 1410 /// vector widths. The cost that is returned is *not* normalized by 1411 /// the factor width. 1412 VectorizationCostTy expectedCost(unsigned VF); 1413 1414 /// Returns the execution time cost of an instruction for a given vector 1415 /// width. Vector width of one means scalar. 1416 VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF); 1417 1418 /// The cost-computation logic from getInstructionCost which provides 1419 /// the vector type as an output parameter. 1420 unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy); 1421 1422 /// Calculate vectorization cost of memory instruction \p I. 1423 unsigned getMemoryInstructionCost(Instruction *I, unsigned VF); 1424 1425 /// The cost computation for scalarized memory instruction. 
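  /// As a rough sketch (not the exact formula used by the implementation):
  /// the cost of VF scalar memory operations, plus the overhead of extracting
  /// the per-lane addresses (and stored values) from vectors and, for loads,
  /// of inserting the VF results back into a vector.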
1426   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1427
1428   /// The cost computation for an interleaving group of memory instructions.
1429   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1430
1431   /// The cost computation for a Gather/Scatter instruction.
1432   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1433
1434   /// The cost computation for widening instruction \p I with consecutive
1435   /// memory access.
1436   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1437
1438   /// The cost calculation for a Load/Store instruction \p I with a uniform pointer -
1439   /// Load: scalar load + broadcast.
1440   /// Store: scalar store + (loop-invariant value stored ? 0 : extract of last
1441   /// element).
1442   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1443
1444   /// Estimate the overhead of scalarizing an instruction. This is a
1445   /// convenience wrapper for the type-based getScalarizationOverhead API.
1446   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1447
1448   /// Returns whether the instruction is a load or store and will be emitted
1449   /// as a vector operation.
1450   bool isConsecutiveLoadOrStore(Instruction *I);
1451
1452   /// Returns true if an artificially high cost for emulated masked memrefs
1453   /// should be used.
1454   bool useEmulatedMaskMemRefHack(Instruction *I);
1455
1456   /// Map of scalar integer values to the smallest bitwidth they can be legally
1457   /// represented as. The vector equivalents of these values should be truncated
1458   /// to this type.
1459   MapVector<Instruction *, uint64_t> MinBWs;
1460
1461   /// A type representing the costs for instructions if they were to be
1462   /// scalarized rather than vectorized. The entries are Instruction-Cost
1463   /// pairs.
1464   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1465
1466   /// A set containing all BasicBlocks that are known to be present after
1467   /// vectorization as a predicated block.
1468   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1469
1470   /// Records whether it is allowed to have the original scalar loop execute at
1471   /// least once. This may be needed as a fallback loop in case runtime
1472   /// aliasing/dependence checks fail, or to handle the tail/remainder
1473   /// iterations when the trip count is unknown or doesn't divide by the VF,
1474   /// or as a peel-loop to handle gaps in interleave-groups.
1475   /// Under optsize and when the trip count is very small we don't allow any
1476   /// iterations to execute in the scalar loop.
1477   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1478
1479   /// All blocks of the loop are to be masked to fold the tail of scalar iterations.
1480   bool FoldTailByMasking = false;
1481
1482   /// A map holding scalar costs for different vectorization factors. The
1483   /// presence of a cost for an instruction in the mapping indicates that the
1484   /// instruction will be scalarized when vectorizing with the associated
1485   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1486   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1487
1488   /// Holds the instructions known to be uniform after vectorization.
1489   /// The data is collected per VF.
1490   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1491
1492   /// Holds the instructions known to be scalar after vectorization.
1493   /// The data is collected per VF.
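  /// E.g. the address computation feeding a scalarized store is itself scalar
  /// after vectorization, whereas a widened arithmetic instruction is not.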
1494 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars; 1495 1496 /// Holds the instructions (address computations) that are forced to be 1497 /// scalarized. 1498 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1499 1500 /// PHINodes of the reductions that should be expanded in-loop along with 1501 /// their associated chains of reduction operations, in program order from top 1502 /// (PHI) to bottom 1503 ReductionChainMap InLoopReductionChains; 1504 1505 /// Returns the expected difference in cost from scalarizing the expression 1506 /// feeding a predicated instruction \p PredInst. The instructions to 1507 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1508 /// non-negative return value implies the expression will be scalarized. 1509 /// Currently, only single-use chains are considered for scalarization. 1510 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1511 unsigned VF); 1512 1513 /// Collect the instructions that are uniform after vectorization. An 1514 /// instruction is uniform if we represent it with a single scalar value in 1515 /// the vectorized loop corresponding to each vector iteration. Examples of 1516 /// uniform instructions include pointer operands of consecutive or 1517 /// interleaved memory accesses. Note that although uniformity implies an 1518 /// instruction will be scalar, the reverse is not true. In general, a 1519 /// scalarized instruction will be represented by VF scalar values in the 1520 /// vectorized loop, each corresponding to an iteration of the original 1521 /// scalar loop. 1522 void collectLoopUniforms(unsigned VF); 1523 1524 /// Collect the instructions that are scalar after vectorization. An 1525 /// instruction is scalar if it is known to be uniform or will be scalarized 1526 /// during vectorization. Non-uniform scalarized instructions will be 1527 /// represented by VF values in the vectorized loop, each corresponding to an 1528 /// iteration of the original scalar loop. 1529 void collectLoopScalars(unsigned VF); 1530 1531 /// Keeps cost model vectorization decision and cost for instructions. 1532 /// Right now it is used for memory instructions only. 1533 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, 1534 std::pair<InstWidening, unsigned>>; 1535 1536 DecisionList WideningDecisions; 1537 1538 /// Returns true if \p V is expected to be vectorized and it needs to be 1539 /// extracted. 1540 bool needsExtract(Value *V, unsigned VF) const { 1541 Instruction *I = dyn_cast<Instruction>(V); 1542 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) 1543 return false; 1544 1545 // Assume we can vectorize V (and hence we need extraction) if the 1546 // scalars are not computed yet. This can happen, because it is called 1547 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1548 // the scalars are collected. That should be a safe assumption in most 1549 // cases, because we check if the operands have vectorizable types 1550 // beforehand in LoopVectorizationLegality. 1551 return Scalars.find(VF) == Scalars.end() || 1552 !isScalarAfterVectorization(I, VF); 1553 }; 1554 1555 /// Returns a range containing only operands needing to be extracted. 
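  /// E.g. for a call that is scalarized while one of its operands is produced
  /// by a widened instruction, each scalar copy of the call needs an
  /// extractelement of that operand; loop-invariant and already-scalar
  /// operands are filtered out.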
1556 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1557 unsigned VF) { 1558 return SmallVector<Value *, 4>(make_filter_range( 1559 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1560 } 1561 1562 public: 1563 /// The loop that we evaluate. 1564 Loop *TheLoop; 1565 1566 /// Predicated scalar evolution analysis. 1567 PredicatedScalarEvolution &PSE; 1568 1569 /// Loop Info analysis. 1570 LoopInfo *LI; 1571 1572 /// Vectorization legality. 1573 LoopVectorizationLegality *Legal; 1574 1575 /// Vector target information. 1576 const TargetTransformInfo &TTI; 1577 1578 /// Target Library Info. 1579 const TargetLibraryInfo *TLI; 1580 1581 /// Demanded bits analysis. 1582 DemandedBits *DB; 1583 1584 /// Assumption cache. 1585 AssumptionCache *AC; 1586 1587 /// Interface to emit optimization remarks. 1588 OptimizationRemarkEmitter *ORE; 1589 1590 const Function *TheFunction; 1591 1592 /// Loop Vectorize Hint. 1593 const LoopVectorizeHints *Hints; 1594 1595 /// The interleave access information contains groups of interleaved accesses 1596 /// with the same stride and close to each other. 1597 InterleavedAccessInfo &InterleaveInfo; 1598 1599 /// Values to ignore in the cost model. 1600 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1601 1602 /// Values to ignore in the cost model when VF > 1. 1603 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1604 }; 1605 1606 } // end namespace llvm 1607 1608 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1609 // vectorization. The loop needs to be annotated with #pragma omp simd 1610 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1611 // vector length information is not provided, vectorization is not considered 1612 // explicit. Interleave hints are not allowed either. These limitations will be 1613 // relaxed in the future. 1614 // Please, note that we are currently forced to abuse the pragma 'clang 1615 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1616 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1617 // provides *explicit vectorization hints* (LV can bypass legal checks and 1618 // assume that vectorization is legal). However, both hints are implemented 1619 // using the same metadata (llvm.loop.vectorize, processed by 1620 // LoopVectorizeHints). This will be fixed in the future when the native IR 1621 // representation for pragma 'omp simd' is introduced. 1622 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1623 OptimizationRemarkEmitter *ORE) { 1624 assert(!OuterLp->empty() && "This is not an outer loop"); 1625 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1626 1627 // Only outer loops with an explicit vectorization hint are supported. 1628 // Unannotated outer loops are ignored. 1629 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1630 return false; 1631 1632 Function *Fn = OuterLp->getHeader()->getParent(); 1633 if (!Hints.allowVectorization(Fn, OuterLp, 1634 true /*VectorizeOnlyWhenForced*/)) { 1635 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1636 return false; 1637 } 1638 1639 if (Hints.getInterleave() > 1) { 1640 // TODO: Interleave support is future work. 
1641 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1642 "outer loops.\n"); 1643 Hints.emitRemarkWithHints(); 1644 return false; 1645 } 1646 1647 return true; 1648 } 1649 1650 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1651 OptimizationRemarkEmitter *ORE, 1652 SmallVectorImpl<Loop *> &V) { 1653 // Collect inner loops and outer loops without irreducible control flow. For 1654 // now, only collect outer loops that have explicit vectorization hints. If we 1655 // are stress testing the VPlan H-CFG construction, we collect the outermost 1656 // loop of every loop nest. 1657 if (L.empty() || VPlanBuildStressTest || 1658 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1659 LoopBlocksRPO RPOT(&L); 1660 RPOT.perform(LI); 1661 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1662 V.push_back(&L); 1663 // TODO: Collect inner loops inside marked outer loops in case 1664 // vectorization fails for the outer loop. Do not invoke 1665 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1666 // already known to be reducible. We can use an inherited attribute for 1667 // that. 1668 return; 1669 } 1670 } 1671 for (Loop *InnerL : L) 1672 collectSupportedLoops(*InnerL, LI, ORE, V); 1673 } 1674 1675 namespace { 1676 1677 /// The LoopVectorize Pass. 1678 struct LoopVectorize : public FunctionPass { 1679 /// Pass identification, replacement for typeid 1680 static char ID; 1681 1682 LoopVectorizePass Impl; 1683 1684 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1685 bool VectorizeOnlyWhenForced = false) 1686 : FunctionPass(ID), 1687 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1688 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1689 } 1690 1691 bool runOnFunction(Function &F) override { 1692 if (skipFunction(F)) 1693 return false; 1694 1695 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1696 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1697 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1698 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1699 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1700 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1701 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 1702 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1703 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1704 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1705 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1706 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1707 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1708 1709 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1710 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1711 1712 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1713 GetLAA, *ORE, PSI).MadeAnyChange; 1714 } 1715 1716 void getAnalysisUsage(AnalysisUsage &AU) const override { 1717 AU.addRequired<AssumptionCacheTracker>(); 1718 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1719 AU.addRequired<DominatorTreeWrapperPass>(); 1720 AU.addRequired<LoopInfoWrapperPass>(); 1721 AU.addRequired<ScalarEvolutionWrapperPass>(); 1722 AU.addRequired<TargetTransformInfoWrapperPass>(); 1723 AU.addRequired<AAResultsWrapperPass>(); 1724 AU.addRequired<LoopAccessLegacyAnalysis>(); 1725 AU.addRequired<DemandedBitsWrapperPass>(); 1726 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1727 AU.addRequired<InjectTLIMappingsLegacy>(); 1728 1729 // We currently do not preserve loopinfo/dominator analyses with outer loop 1730 // vectorization. Until this is addressed, mark these analyses as preserved 1731 // only for non-VPlan-native path. 1732 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1733 if (!EnableVPlanNativePath) { 1734 AU.addPreserved<LoopInfoWrapperPass>(); 1735 AU.addPreserved<DominatorTreeWrapperPass>(); 1736 } 1737 1738 AU.addPreserved<BasicAAWrapperPass>(); 1739 AU.addPreserved<GlobalsAAWrapperPass>(); 1740 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1741 } 1742 }; 1743 1744 } // end anonymous namespace 1745 1746 //===----------------------------------------------------------------------===// 1747 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1748 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1749 //===----------------------------------------------------------------------===// 1750 1751 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1752 // We need to place the broadcast of invariant variables outside the loop, 1753 // but only if it's proven safe to do so. Else, broadcast will be inside 1754 // vector loop body. 1755 Instruction *Instr = dyn_cast<Instruction>(V); 1756 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1757 (!Instr || 1758 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1759 // Place the code for broadcasting invariant variables in the new preheader. 1760 IRBuilder<>::InsertPointGuard Guard(Builder); 1761 if (SafeToHoist) 1762 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1763 1764 // Broadcast the scalar into all locations in the vector. 
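  // For illustration, with VF = 4 the splat below is typically emitted as an
  // insertelement into lane zero followed by a zero-mask shufflevector, e.g.
  // (exact IR may differ):
  //   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %v, i32 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                                    <4 x i32> undef, <4 x i32> zeroinitializer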
1765 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1766 1767 return Shuf; 1768 } 1769 1770 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1771 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1772 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1773 "Expected either an induction phi-node or a truncate of it!"); 1774 Value *Start = II.getStartValue(); 1775 1776 // Construct the initial value of the vector IV in the vector loop preheader 1777 auto CurrIP = Builder.saveIP(); 1778 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1779 if (isa<TruncInst>(EntryVal)) { 1780 assert(Start->getType()->isIntegerTy() && 1781 "Truncation requires an integer type"); 1782 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1783 Step = Builder.CreateTrunc(Step, TruncType); 1784 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1785 } 1786 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1787 Value *SteppedStart = 1788 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1789 1790 // We create vector phi nodes for both integer and floating-point induction 1791 // variables. Here, we determine the kind of arithmetic we will perform. 1792 Instruction::BinaryOps AddOp; 1793 Instruction::BinaryOps MulOp; 1794 if (Step->getType()->isIntegerTy()) { 1795 AddOp = Instruction::Add; 1796 MulOp = Instruction::Mul; 1797 } else { 1798 AddOp = II.getInductionOpcode(); 1799 MulOp = Instruction::FMul; 1800 } 1801 1802 // Multiply the vectorization factor by the step using integer or 1803 // floating-point arithmetic as appropriate. 1804 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); 1805 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1806 1807 // Create a vector splat to use in the induction update. 1808 // 1809 // FIXME: If the step is non-constant, we create the vector splat with 1810 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1811 // handle a constant vector splat. 1812 Value *SplatVF = isa<Constant>(Mul) 1813 ? ConstantVector::getSplat(ElementCount::getFixed(VF), 1814 cast<Constant>(Mul)) 1815 : Builder.CreateVectorSplat(VF, Mul); 1816 Builder.restoreIP(CurrIP); 1817 1818 // We may need to add the step a number of times, depending on the unroll 1819 // factor. The last of those goes into the PHI. 1820 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1821 &*LoopVectorBody->getFirstInsertionPt()); 1822 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1823 Instruction *LastInduction = VecInd; 1824 for (unsigned Part = 0; Part < UF; ++Part) { 1825 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1826 1827 if (isa<TruncInst>(EntryVal)) 1828 addMetadata(LastInduction, EntryVal); 1829 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1830 1831 LastInduction = cast<Instruction>(addFastMathFlag( 1832 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1833 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1834 } 1835 1836 // Move the last step to the end of the latch block. This ensures consistent 1837 // placement of all induction updates. 
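  // For illustration, with Start = 0, Step = 1, VF = 4 and UF = 2 the unroll
  // parts hold <0,1,2,3> and <4,5,6,7>, and the value fed back through the PHI
  // ("vec.ind.next") is <8,9,10,11>, i.e. the vector IV advances by
  // VF * UF * Step per vector iteration.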
1838 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1839 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1840 auto *ICmp = cast<Instruction>(Br->getCondition()); 1841 LastInduction->moveBefore(ICmp); 1842 LastInduction->setName("vec.ind.next"); 1843 1844 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1845 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1846 } 1847 1848 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1849 return Cost->isScalarAfterVectorization(I, VF) || 1850 Cost->isProfitableToScalarize(I, VF); 1851 } 1852 1853 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1854 if (shouldScalarizeInstruction(IV)) 1855 return true; 1856 auto isScalarInst = [&](User *U) -> bool { 1857 auto *I = cast<Instruction>(U); 1858 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1859 }; 1860 return llvm::any_of(IV->users(), isScalarInst); 1861 } 1862 1863 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1864 const InductionDescriptor &ID, const Instruction *EntryVal, 1865 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1866 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1867 "Expected either an induction phi-node or a truncate of it!"); 1868 1869 // This induction variable is not the phi from the original loop but the 1870 // newly-created IV based on the proof that casted Phi is equal to the 1871 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1872 // re-uses the same InductionDescriptor that original IV uses but we don't 1873 // have to do any recording in this case - that is done when original IV is 1874 // processed. 1875 if (isa<TruncInst>(EntryVal)) 1876 return; 1877 1878 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1879 if (Casts.empty()) 1880 return; 1881 // Only the first Cast instruction in the Casts vector is of interest. 1882 // The rest of the Casts (if exist) have no uses outside the 1883 // induction update chain itself. 1884 Instruction *CastInst = *Casts.begin(); 1885 if (Lane < UINT_MAX) 1886 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1887 else 1888 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1889 } 1890 1891 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1892 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1893 "Primary induction variable must have an integer type"); 1894 1895 auto II = Legal->getInductionVars().find(IV); 1896 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 1897 1898 auto ID = II->second; 1899 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1900 1901 // The value from the original loop to which we are mapping the new induction 1902 // variable. 1903 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1904 1905 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1906 1907 // Generate code for the induction step. 
Note that induction steps are 1908 // required to be loop-invariant 1909 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 1910 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 1911 "Induction step should be loop invariant"); 1912 if (PSE.getSE()->isSCEVable(IV->getType())) { 1913 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1914 return Exp.expandCodeFor(Step, Step->getType(), 1915 LoopVectorPreHeader->getTerminator()); 1916 } 1917 return cast<SCEVUnknown>(Step)->getValue(); 1918 }; 1919 1920 // The scalar value to broadcast. This is derived from the canonical 1921 // induction variable. If a truncation type is given, truncate the canonical 1922 // induction variable and step. Otherwise, derive these values from the 1923 // induction descriptor. 1924 auto CreateScalarIV = [&](Value *&Step) -> Value * { 1925 Value *ScalarIV = Induction; 1926 if (IV != OldInduction) { 1927 ScalarIV = IV->getType()->isIntegerTy() 1928 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1929 : Builder.CreateCast(Instruction::SIToFP, Induction, 1930 IV->getType()); 1931 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1932 ScalarIV->setName("offset.idx"); 1933 } 1934 if (Trunc) { 1935 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1936 assert(Step->getType()->isIntegerTy() && 1937 "Truncation requires an integer step"); 1938 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1939 Step = Builder.CreateTrunc(Step, TruncType); 1940 } 1941 return ScalarIV; 1942 }; 1943 1944 // Create the vector values from the scalar IV, in the absence of creating a 1945 // vector IV. 1946 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 1947 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1948 for (unsigned Part = 0; Part < UF; ++Part) { 1949 Value *EntryPart = 1950 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); 1951 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1952 if (Trunc) 1953 addMetadata(EntryPart, Trunc); 1954 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1955 } 1956 }; 1957 1958 // Now do the actual transformations, and start with creating the step value. 1959 Value *Step = CreateStepValue(ID.getStep()); 1960 if (VF <= 1) { 1961 Value *ScalarIV = CreateScalarIV(Step); 1962 CreateSplatIV(ScalarIV, Step); 1963 return; 1964 } 1965 1966 // Determine if we want a scalar version of the induction variable. This is 1967 // true if the induction variable itself is not widened, or if it has at 1968 // least one user in the loop that is not widened. 1969 auto NeedsScalarIV = needsScalarInduction(EntryVal); 1970 if (!NeedsScalarIV) { 1971 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1972 return; 1973 } 1974 1975 // Try to create a new independent vector induction variable. If we can't 1976 // create the phi node, we will splat the scalar induction variable in each 1977 // loop iteration. 1978 if (!shouldScalarizeInstruction(EntryVal)) { 1979 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1980 Value *ScalarIV = CreateScalarIV(Step); 1981 // Create scalar steps that can be used by instructions we will later 1982 // scalarize. Note that the addition of the scalar steps will not increase 1983 // the number of instructions in the loop in the common case prior to 1984 // InstCombine. We will be trading one vector extract for each scalar step. 
1985 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1986 return; 1987 } 1988 1989 // All IV users are scalar instructions, so only emit a scalar IV, not a 1990 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 1991 // predicate used by the masked loads/stores. 1992 Value *ScalarIV = CreateScalarIV(Step); 1993 if (!Cost->isScalarEpilogueAllowed()) 1994 CreateSplatIV(ScalarIV, Step); 1995 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1996 } 1997 1998 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 1999 Instruction::BinaryOps BinOp) { 2000 // Create and check the types. 2001 auto *ValVTy = cast<VectorType>(Val->getType()); 2002 int VLen = ValVTy->getNumElements(); 2003 2004 Type *STy = Val->getType()->getScalarType(); 2005 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2006 "Induction Step must be an integer or FP"); 2007 assert(Step->getType() == STy && "Step has wrong type"); 2008 2009 SmallVector<Constant *, 8> Indices; 2010 2011 if (STy->isIntegerTy()) { 2012 // Create a vector of consecutive numbers from zero to VF. 2013 for (int i = 0; i < VLen; ++i) 2014 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2015 2016 // Add the consecutive indices to the vector value. 2017 Constant *Cv = ConstantVector::get(Indices); 2018 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2019 Step = Builder.CreateVectorSplat(VLen, Step); 2020 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2021 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2022 // which can be found from the original scalar operations. 2023 Step = Builder.CreateMul(Cv, Step); 2024 return Builder.CreateAdd(Val, Step, "induction"); 2025 } 2026 2027 // Floating point induction. 2028 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2029 "Binary Opcode should be specified for FP induction"); 2030 // Create a vector of consecutive numbers from zero to VF. 2031 for (int i = 0; i < VLen; ++i) 2032 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2033 2034 // Add the consecutive indices to the vector value. 2035 Constant *Cv = ConstantVector::get(Indices); 2036 2037 Step = Builder.CreateVectorSplat(VLen, Step); 2038 2039 // Floating point operations had to be 'fast' to enable the induction. 2040 FastMathFlags Flags; 2041 Flags.setFast(); 2042 2043 Value *MulOp = Builder.CreateFMul(Cv, Step); 2044 if (isa<Instruction>(MulOp)) 2045 // Have to check, MulOp may be a constant 2046 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 2047 2048 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2049 if (isa<Instruction>(BOp)) 2050 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2051 return BOp; 2052 } 2053 2054 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2055 Instruction *EntryVal, 2056 const InductionDescriptor &ID) { 2057 // We shouldn't have to build scalar steps if we aren't vectorizing. 2058 assert(VF > 1 && "VF should be greater than one"); 2059 2060 // Get the value type and ensure it and the step have the same integer type. 2061 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2062 assert(ScalarIVTy == Step->getType() && 2063 "Val and Step should have the same type"); 2064 2065 // We build scalar steps for both integer and floating-point induction 2066 // variables. Here, we determine the kind of arithmetic we will perform. 
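  // For illustration, with VF = 4 and UF = 2 the scalar step generated for
  // lane L of unroll part P below is ScalarIV + (P * 4 + L) * Step, covering
  // offsets 0..7 of the widened iteration.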
2067 Instruction::BinaryOps AddOp; 2068 Instruction::BinaryOps MulOp; 2069 if (ScalarIVTy->isIntegerTy()) { 2070 AddOp = Instruction::Add; 2071 MulOp = Instruction::Mul; 2072 } else { 2073 AddOp = ID.getInductionOpcode(); 2074 MulOp = Instruction::FMul; 2075 } 2076 2077 // Determine the number of scalars we need to generate for each unroll 2078 // iteration. If EntryVal is uniform, we only need to generate the first 2079 // lane. Otherwise, we generate all VF values. 2080 unsigned Lanes = 2081 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 2082 : VF; 2083 // Compute the scalar steps and save the results in VectorLoopValueMap. 2084 for (unsigned Part = 0; Part < UF; ++Part) { 2085 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2086 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); 2087 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2088 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2089 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2090 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2091 } 2092 } 2093 } 2094 2095 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2096 assert(V != Induction && "The new induction variable should not be used."); 2097 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2098 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2099 2100 // If we have a stride that is replaced by one, do it here. Defer this for 2101 // the VPlan-native path until we start running Legal checks in that path. 2102 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2103 V = ConstantInt::get(V->getType(), 1); 2104 2105 // If we have a vector mapped to this value, return it. 2106 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2107 return VectorLoopValueMap.getVectorValue(V, Part); 2108 2109 // If the value has not been vectorized, check if it has been scalarized 2110 // instead. If it has been scalarized, and we actually need the value in 2111 // vector form, we will construct the vector values on demand. 2112 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2113 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2114 2115 // If we've scalarized a value, that value should be an instruction. 2116 auto *I = cast<Instruction>(V); 2117 2118 // If we aren't vectorizing, we can just copy the scalar map values over to 2119 // the vector map. 2120 if (VF == 1) { 2121 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2122 return ScalarValue; 2123 } 2124 2125 // Get the last scalar instruction we generated for V and Part. If the value 2126 // is known to be uniform after vectorization, this corresponds to lane zero 2127 // of the Part unroll iteration. Otherwise, the last instruction is the one 2128 // we created for the last vector lane of the Part unroll iteration. 2129 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; 2130 auto *LastInst = cast<Instruction>( 2131 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2132 2133 // Set the insert point after the last scalarized instruction. This ensures 2134 // the insertelement sequence will directly follow the scalar definitions. 2135 auto OldIP = Builder.saveIP(); 2136 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2137 Builder.SetInsertPoint(&*NewIP); 2138 2139 // However, if we are vectorizing, we need to construct the vector values. 
2140 // If the value is known to be uniform after vectorization, we can just 2141 // broadcast the scalar value corresponding to lane zero for each unroll 2142 // iteration. Otherwise, we construct the vector values using insertelement 2143 // instructions. Since the resulting vectors are stored in 2144 // VectorLoopValueMap, we will only generate the insertelements once. 2145 Value *VectorValue = nullptr; 2146 if (Cost->isUniformAfterVectorization(I, VF)) { 2147 VectorValue = getBroadcastInstrs(ScalarValue); 2148 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2149 } else { 2150 // Initialize packing with insertelements to start from undef. 2151 Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), VF)); 2152 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2153 for (unsigned Lane = 0; Lane < VF; ++Lane) 2154 packScalarIntoVectorValue(V, {Part, Lane}); 2155 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2156 } 2157 Builder.restoreIP(OldIP); 2158 return VectorValue; 2159 } 2160 2161 // If this scalar is unknown, assume that it is a constant or that it is 2162 // loop invariant. Broadcast V and save the value for future uses. 2163 Value *B = getBroadcastInstrs(V); 2164 VectorLoopValueMap.setVectorValue(V, Part, B); 2165 return B; 2166 } 2167 2168 Value * 2169 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2170 const VPIteration &Instance) { 2171 // If the value is not an instruction contained in the loop, it should 2172 // already be scalar. 2173 if (OrigLoop->isLoopInvariant(V)) 2174 return V; 2175 2176 assert(Instance.Lane > 0 2177 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2178 : true && "Uniform values only have lane zero"); 2179 2180 // If the value from the original loop has not been vectorized, it is 2181 // represented by UF x VF scalar values in the new loop. Return the requested 2182 // scalar value. 2183 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2184 return VectorLoopValueMap.getScalarValue(V, Instance); 2185 2186 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2187 // for the given unroll part. If this entry is not a vector type (i.e., the 2188 // vectorization factor is one), there is no need to generate an 2189 // extractelement instruction. 2190 auto *U = getOrCreateVectorValue(V, Instance.Part); 2191 if (!U->getType()->isVectorTy()) { 2192 assert(VF == 1 && "Value not scalarized has non-vector type"); 2193 return U; 2194 } 2195 2196 // Otherwise, the value from the original loop has been vectorized and is 2197 // represented by UF vector values. Extract and return the requested scalar 2198 // value from the appropriate vector lane. 
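  // E.g. requesting {Part 1, Lane 2} of a value widened with VF = 4 yields an
  // "extractelement <4 x i32> %wide.part1, i32 2" (names and types shown are
  // illustrative).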
2199 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2200 } 2201 2202 void InnerLoopVectorizer::packScalarIntoVectorValue( 2203 Value *V, const VPIteration &Instance) { 2204 assert(V != Induction && "The new induction variable should not be used."); 2205 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2206 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2207 2208 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2209 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2210 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2211 Builder.getInt32(Instance.Lane)); 2212 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2213 } 2214 2215 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2216 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2217 SmallVector<int, 8> ShuffleMask; 2218 for (unsigned i = 0; i < VF; ++i) 2219 ShuffleMask.push_back(VF - i - 1); 2220 2221 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2222 ShuffleMask, "reverse"); 2223 } 2224 2225 // Return whether we allow using masked interleave-groups (for dealing with 2226 // strided loads/stores that reside in predicated blocks, or for dealing 2227 // with gaps). 2228 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2229 // If an override option has been passed in for interleaved accesses, use it. 2230 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2231 return EnableMaskedInterleavedMemAccesses; 2232 2233 return TTI.enableMaskedInterleavedAccessVectorization(); 2234 } 2235 2236 // Try to vectorize the interleave group that \p Instr belongs to. 2237 // 2238 // E.g. Translate following interleaved load group (factor = 3): 2239 // for (i = 0; i < N; i+=3) { 2240 // R = Pic[i]; // Member of index 0 2241 // G = Pic[i+1]; // Member of index 1 2242 // B = Pic[i+2]; // Member of index 2 2243 // ... // do something to R, G, B 2244 // } 2245 // To: 2246 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2247 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2248 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2249 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2250 // 2251 // Or translate following interleaved store group (factor = 3): 2252 // for (i = 0; i < N; i+=3) { 2253 // ... do something to R, G, B 2254 // Pic[i] = R; // Member of index 0 2255 // Pic[i+1] = G; // Member of index 1 2256 // Pic[i+2] = B; // Member of index 2 2257 // } 2258 // To: 2259 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2260 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2261 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2262 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2263 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2264 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2265 const InterleaveGroup<Instruction> *Group, VPTransformState &State, 2266 VPValue *Addr, VPValue *BlockInMask) { 2267 Instruction *Instr = Group->getInsertPos(); 2268 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2269 2270 // Prepare for the vector type of the interleaved load/store. 2271 Type *ScalarTy = getMemInstValueType(Instr); 2272 unsigned InterleaveFactor = Group->getFactor(); 2273 auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF); 2274 2275 // Prepare for the new pointers. 
2276 SmallVector<Value *, 2> AddrParts; 2277 unsigned Index = Group->getIndex(Instr); 2278 2279 // TODO: extend the masked interleaved-group support to reversed access. 2280 assert((!BlockInMask || !Group->isReverse()) && 2281 "Reversed masked interleave-group not supported."); 2282 2283 // If the group is reverse, adjust the index to refer to the last vector lane 2284 // instead of the first. We adjust the index from the first vector lane, 2285 // rather than directly getting the pointer for lane VF - 1, because the 2286 // pointer operand of the interleaved access is supposed to be uniform. For 2287 // uniform instructions, we're only required to generate a value for the 2288 // first vector lane in each unroll iteration. 2289 if (Group->isReverse()) 2290 Index += (VF - 1) * Group->getFactor(); 2291 2292 for (unsigned Part = 0; Part < UF; Part++) { 2293 Value *AddrPart = State.get(Addr, {Part, 0}); 2294 setDebugLocFromInst(Builder, AddrPart); 2295 2296 // Notice current instruction could be any index. Need to adjust the address 2297 // to the member of index 0. 2298 // 2299 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2300 // b = A[i]; // Member of index 0 2301 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2302 // 2303 // E.g. A[i+1] = a; // Member of index 1 2304 // A[i] = b; // Member of index 0 2305 // A[i+2] = c; // Member of index 2 (Current instruction) 2306 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2307 2308 bool InBounds = false; 2309 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2310 InBounds = gep->isInBounds(); 2311 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2312 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2313 2314 // Cast to the vector pointer type. 2315 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2316 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2317 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2318 } 2319 2320 setDebugLocFromInst(Builder, Instr); 2321 Value *UndefVec = UndefValue::get(VecTy); 2322 2323 Value *MaskForGaps = nullptr; 2324 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2325 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); 2326 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2327 } 2328 2329 // Vectorize the interleaved load group. 2330 if (isa<LoadInst>(Instr)) { 2331 // For each unroll part, create a wide load for the group. 2332 SmallVector<Value *, 2> NewLoads; 2333 for (unsigned Part = 0; Part < UF; Part++) { 2334 Instruction *NewLoad; 2335 if (BlockInMask || MaskForGaps) { 2336 assert(useMaskedInterleavedAccesses(*TTI) && 2337 "masked interleaved groups are not allowed."); 2338 Value *GroupMask = MaskForGaps; 2339 if (BlockInMask) { 2340 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2341 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2342 Value *ShuffledMask = Builder.CreateShuffleVector( 2343 BlockInMaskPart, Undefs, 2344 createReplicatedMask(InterleaveFactor, VF), "interleaved.mask"); 2345 GroupMask = MaskForGaps 2346 ? 
Builder.CreateBinOp(Instruction::And, ShuffledMask, 2347 MaskForGaps) 2348 : ShuffledMask; 2349 } 2350 NewLoad = 2351 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2352 GroupMask, UndefVec, "wide.masked.vec"); 2353 } 2354 else 2355 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2356 Group->getAlign(), "wide.vec"); 2357 Group->addMetadata(NewLoad); 2358 NewLoads.push_back(NewLoad); 2359 } 2360 2361 // For each member in the group, shuffle out the appropriate data from the 2362 // wide loads. 2363 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2364 Instruction *Member = Group->getMember(I); 2365 2366 // Skip the gaps in the group. 2367 if (!Member) 2368 continue; 2369 2370 auto StrideMask = createStrideMask(I, InterleaveFactor, VF); 2371 for (unsigned Part = 0; Part < UF; Part++) { 2372 Value *StridedVec = Builder.CreateShuffleVector( 2373 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2374 2375 // If this member has different type, cast the result type. 2376 if (Member->getType() != ScalarTy) { 2377 VectorType *OtherVTy = FixedVectorType::get(Member->getType(), VF); 2378 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2379 } 2380 2381 if (Group->isReverse()) 2382 StridedVec = reverseVector(StridedVec); 2383 2384 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2385 } 2386 } 2387 return; 2388 } 2389 2390 // The sub vector type for current instruction. 2391 auto *SubVT = FixedVectorType::get(ScalarTy, VF); 2392 2393 // Vectorize the interleaved store group. 2394 for (unsigned Part = 0; Part < UF; Part++) { 2395 // Collect the stored vector from each member. 2396 SmallVector<Value *, 4> StoredVecs; 2397 for (unsigned i = 0; i < InterleaveFactor; i++) { 2398 // Interleaved store group doesn't allow a gap, so each index has a member 2399 Instruction *Member = Group->getMember(i); 2400 assert(Member && "Fail to get a member from an interleaved store group"); 2401 2402 Value *StoredVec = getOrCreateVectorValue( 2403 cast<StoreInst>(Member)->getValueOperand(), Part); 2404 if (Group->isReverse()) 2405 StoredVec = reverseVector(StoredVec); 2406 2407 // If this member has different type, cast it to a unified type. 2408 2409 if (StoredVec->getType() != SubVT) 2410 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2411 2412 StoredVecs.push_back(StoredVec); 2413 } 2414 2415 // Concatenate all vectors into a wide vector. 2416 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2417 2418 // Interleave the elements in the wide vector. 2419 Value *IVec = Builder.CreateShuffleVector( 2420 WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor), 2421 "interleaved.vec"); 2422 2423 Instruction *NewStoreInstr; 2424 if (BlockInMask) { 2425 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2426 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2427 Value *ShuffledMask = Builder.CreateShuffleVector( 2428 BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF), 2429 "interleaved.mask"); 2430 NewStoreInstr = Builder.CreateMaskedStore( 2431 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2432 } 2433 else 2434 NewStoreInstr = 2435 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2436 2437 Group->addMetadata(NewStoreInstr); 2438 } 2439 } 2440 2441 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2442 VPTransformState &State, 2443 VPValue *Addr, 2444 VPValue *StoredValue, 2445 VPValue *BlockInMask) { 2446 // Attempt to issue a wide load. 
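  // For illustration, a consecutive access with VF = 4 is conceptually widened
  // into a bitcast of the scalar pointer to <4 x i32>* followed by a single
  // wide (possibly masked) load or store, whereas a gather/scatter keeps a
  // vector of pointers and uses the llvm.masked.gather / llvm.masked.scatter
  // intrinsics.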
2447 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2448 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2449 2450 assert((LI || SI) && "Invalid Load/Store instruction"); 2451 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2452 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2453 2454 LoopVectorizationCostModel::InstWidening Decision = 2455 Cost->getWideningDecision(Instr, VF); 2456 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2457 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2458 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2459 "CM decision is not to widen the memory instruction"); 2460 2461 Type *ScalarDataTy = getMemInstValueType(Instr); 2462 auto *DataTy = FixedVectorType::get(ScalarDataTy, VF); 2463 const Align Alignment = getLoadStoreAlignment(Instr); 2464 2465 // Determine if the pointer operand of the access is either consecutive or 2466 // reverse consecutive. 2467 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2468 bool ConsecutiveStride = 2469 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2470 bool CreateGatherScatter = 2471 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2472 2473 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2474 // gather/scatter. Otherwise Decision should have been to Scalarize. 2475 assert((ConsecutiveStride || CreateGatherScatter) && 2476 "The instruction should be scalarized"); 2477 (void)ConsecutiveStride; 2478 2479 VectorParts BlockInMaskParts(UF); 2480 bool isMaskRequired = BlockInMask; 2481 if (isMaskRequired) 2482 for (unsigned Part = 0; Part < UF; ++Part) 2483 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2484 2485 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2486 // Calculate the pointer for the specific unroll-part. 2487 GetElementPtrInst *PartPtr = nullptr; 2488 2489 bool InBounds = false; 2490 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2491 InBounds = gep->isInBounds(); 2492 2493 if (Reverse) { 2494 // If the address is consecutive but reversed, then the 2495 // wide store needs to start at the last vector element. 2496 PartPtr = cast<GetElementPtrInst>( 2497 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); 2498 PartPtr->setIsInBounds(InBounds); 2499 PartPtr = cast<GetElementPtrInst>( 2500 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); 2501 PartPtr->setIsInBounds(InBounds); 2502 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2503 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2504 } else { 2505 PartPtr = cast<GetElementPtrInst>( 2506 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); 2507 PartPtr->setIsInBounds(InBounds); 2508 } 2509 2510 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2511 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2512 }; 2513 2514 // Handle Stores: 2515 if (SI) { 2516 setDebugLocFromInst(Builder, SI); 2517 2518 for (unsigned Part = 0; Part < UF; ++Part) { 2519 Instruction *NewSI = nullptr; 2520 Value *StoredVal = State.get(StoredValue, Part); 2521 if (CreateGatherScatter) { 2522 Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; 2523 Value *VectorGep = State.get(Addr, Part); 2524 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2525 MaskPart); 2526 } else { 2527 if (Reverse) { 2528 // If we store to reverse consecutive memory locations, then we need 2529 // to reverse the order of elements in the stored value. 2530 StoredVal = reverseVector(StoredVal); 2531 // We don't want to update the value in the map as it might be used in 2532 // another expression. So don't call resetVectorValue(StoredVal). 2533 } 2534 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2535 if (isMaskRequired) 2536 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2537 BlockInMaskParts[Part]); 2538 else 2539 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2540 } 2541 addMetadata(NewSI, SI); 2542 } 2543 return; 2544 } 2545 2546 // Handle loads. 2547 assert(LI && "Must have a load instruction"); 2548 setDebugLocFromInst(Builder, LI); 2549 for (unsigned Part = 0; Part < UF; ++Part) { 2550 Value *NewLI; 2551 if (CreateGatherScatter) { 2552 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2553 Value *VectorGep = State.get(Addr, Part); 2554 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2555 nullptr, "wide.masked.gather"); 2556 addMetadata(NewLI, LI); 2557 } else { 2558 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2559 if (isMaskRequired) 2560 NewLI = Builder.CreateMaskedLoad( 2561 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2562 "wide.masked.load"); 2563 else 2564 NewLI = 2565 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2566 2567 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2568 addMetadata(NewLI, LI); 2569 if (Reverse) 2570 NewLI = reverseVector(NewLI); 2571 } 2572 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2573 } 2574 } 2575 2576 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2577 const VPIteration &Instance, 2578 bool IfPredicateInstr, 2579 VPTransformState &State) { 2580 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2581 2582 setDebugLocFromInst(Builder, Instr); 2583 2584 // Does this instruction return a value ? 2585 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2586 2587 Instruction *Cloned = Instr->clone(); 2588 if (!IsVoidRetTy) 2589 Cloned->setName(Instr->getName() + ".cloned"); 2590 2591 // Replace the operands of the cloned instructions with their scalar 2592 // equivalents in the new loop. 2593 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2594 auto *NewOp = State.get(User.getOperand(op), Instance); 2595 Cloned->setOperand(op, NewOp); 2596 } 2597 addNewMetadata(Cloned, Instr); 2598 2599 // Place the cloned scalar in the new loop. 2600 Builder.Insert(Cloned); 2601 2602 // Add the cloned scalar to the scalar map entry. 2603 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2604 2605 // If we just cloned a new assumption, add it the assumption cache. 2606 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2607 if (II->getIntrinsicID() == Intrinsic::assume) 2608 AC->registerAssumption(II); 2609 2610 // End if-block. 
2611 if (IfPredicateInstr) 2612 PredicatedInstructions.push_back(Cloned); 2613 } 2614 2615 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2616 Value *End, Value *Step, 2617 Instruction *DL) { 2618 BasicBlock *Header = L->getHeader(); 2619 BasicBlock *Latch = L->getLoopLatch(); 2620 // As we're just creating this loop, it's possible no latch exists 2621 // yet. If so, use the header as this will be a single block loop. 2622 if (!Latch) 2623 Latch = Header; 2624 2625 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2626 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2627 setDebugLocFromInst(Builder, OldInst); 2628 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2629 2630 Builder.SetInsertPoint(Latch->getTerminator()); 2631 setDebugLocFromInst(Builder, OldInst); 2632 2633 // Create i+1 and fill the PHINode. 2634 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2635 Induction->addIncoming(Start, L->getLoopPreheader()); 2636 Induction->addIncoming(Next, Latch); 2637 // Create the compare. 2638 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2639 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2640 2641 // Now we have two terminators. Remove the old one from the block. 2642 Latch->getTerminator()->eraseFromParent(); 2643 2644 return Induction; 2645 } 2646 2647 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2648 if (TripCount) 2649 return TripCount; 2650 2651 assert(L && "Create Trip Count for null loop."); 2652 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2653 // Find the loop boundaries. 2654 ScalarEvolution *SE = PSE.getSE(); 2655 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2656 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2657 "Invalid loop count"); 2658 2659 Type *IdxTy = Legal->getWidestInductionType(); 2660 assert(IdxTy && "No type for induction"); 2661 2662 // The exit count might have the type of i64 while the phi is i32. This can 2663 // happen if we have an induction variable that is sign extended before the 2664 // compare. The only way that we get a backedge taken count is that the 2665 // induction variable was signed and as such will not overflow. In such a case 2666 // truncation is legal. 2667 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2668 IdxTy->getPrimitiveSizeInBits()) 2669 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2670 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2671 2672 // Get the total trip count from the count by adding 1. 2673 const SCEV *ExitCount = SE->getAddExpr( 2674 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2675 2676 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2677 2678 // Expand the trip count and place the new instructions in the preheader. 2679 // Notice that the pre-header does not change, only the loop body. 2680 SCEVExpander Exp(*SE, DL, "induction"); 2681 2682 // Count holds the overall loop count (N). 
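  // E.g. for a loop that executes n > 0 iterations, the backedge-taken count
  // is n - 1 and the trip count expanded here is n.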
2683 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2684 L->getLoopPreheader()->getTerminator()); 2685 2686 if (TripCount->getType()->isPointerTy()) 2687 TripCount = 2688 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2689 L->getLoopPreheader()->getTerminator()); 2690 2691 return TripCount; 2692 } 2693 2694 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2695 if (VectorTripCount) 2696 return VectorTripCount; 2697 2698 Value *TC = getOrCreateTripCount(L); 2699 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2700 2701 Type *Ty = TC->getType(); 2702 Constant *Step = ConstantInt::get(Ty, VF * UF); 2703 2704 // If the tail is to be folded by masking, round the number of iterations N 2705 // up to a multiple of Step instead of rounding down. This is done by first 2706 // adding Step-1 and then rounding down. Note that it's ok if this addition 2707 // overflows: the vector induction variable will eventually wrap to zero given 2708 // that it starts at zero and its Step is a power of two; the loop will then 2709 // exit, with the last early-exit vector comparison also producing all-true. 2710 if (Cost->foldTailByMasking()) { 2711 assert(isPowerOf2_32(VF * UF) && 2712 "VF*UF must be a power of 2 when folding tail by masking"); 2713 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); 2714 } 2715 2716 // Now we need to generate the expression for the part of the loop that the 2717 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2718 // iterations are not required for correctness, or N - Step, otherwise. Step 2719 // is equal to the vectorization factor (number of SIMD elements) times the 2720 // unroll factor (number of SIMD instructions). 2721 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2722 2723 // If there is a non-reversed interleaved group that may speculatively access 2724 // memory out-of-bounds, we need to ensure that there will be at least one 2725 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2726 // the trip count, we set the remainder to be equal to the step. If the step 2727 // does not evenly divide the trip count, no adjustment is necessary since 2728 // there will already be scalar iterations. Note that the minimum iterations 2729 // check ensures that N >= Step. 2730 if (VF > 1 && Cost->requiresScalarEpilogue()) { 2731 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2732 R = Builder.CreateSelect(IsZero, Step, R); 2733 } 2734 2735 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2736 2737 return VectorTripCount; 2738 } 2739 2740 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2741 const DataLayout &DL) { 2742 // Verify that V is a vector type with same number of elements as DstVTy. 2743 unsigned VF = DstVTy->getNumElements(); 2744 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2745 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2746 Type *SrcElemTy = SrcVecTy->getElementType(); 2747 Type *DstElemTy = DstVTy->getElementType(); 2748 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2749 "Vector elements must have same size"); 2750 2751 // Do a direct cast if element types are castable. 2752 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2753 return Builder.CreateBitOrPointerCast(V, DstVTy); 2754 } 2755 // V cannot be directly casted to desired vector type. 
2756 // May happen when V is a floating point vector but DstVTy is a vector of 2757 // pointers or vice-versa. Handle this using a two-step bitcast using an 2758 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2759 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2760 "Only one type should be a pointer type"); 2761 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2762 "Only one type should be a floating point type"); 2763 Type *IntTy = 2764 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2765 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2766 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2767 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2768 } 2769 2770 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2771 BasicBlock *Bypass) { 2772 Value *Count = getOrCreateTripCount(L); 2773 // Reuse existing vector loop preheader for TC checks. 2774 // Note that new preheader block is generated for vector loop. 2775 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2776 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2777 2778 // Generate code to check if the loop's trip count is less than VF * UF, or 2779 // equal to it in case a scalar epilogue is required; this implies that the 2780 // vector trip count is zero. This check also covers the case where adding one 2781 // to the backedge-taken count overflowed leading to an incorrect trip count 2782 // of zero. In this case we will also jump to the scalar loop. 2783 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 2784 : ICmpInst::ICMP_ULT; 2785 2786 // If tail is to be folded, vector loop takes care of all iterations. 2787 Value *CheckMinIters = Builder.getFalse(); 2788 if (!Cost->foldTailByMasking()) 2789 CheckMinIters = Builder.CreateICmp( 2790 P, Count, ConstantInt::get(Count->getType(), VF * UF), 2791 "min.iters.check"); 2792 2793 // Create new preheader for vector loop. 2794 LoopVectorPreHeader = 2795 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2796 "vector.ph"); 2797 2798 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2799 DT->getNode(Bypass)->getIDom()) && 2800 "TC check is expected to dominate Bypass"); 2801 2802 // Update dominator for Bypass & LoopExit. 2803 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2804 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2805 2806 ReplaceInstWithInst( 2807 TCCheckBlock->getTerminator(), 2808 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2809 LoopBypassBlocks.push_back(TCCheckBlock); 2810 } 2811 2812 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2813 // Reuse existing vector loop preheader for SCEV checks. 2814 // Note that new preheader block is generated for vector loop. 2815 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 2816 2817 // Generate the code to check that the SCEV assumptions that we made. 2818 // We want the new basic block to start at the first instruction in a 2819 // sequence of instructions that form a check. 
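  // For example, if a symbolic stride was assumed to be one while planning
  // the vectorization, the predicate expanded below is a runtime comparison
  // of that stride against 1; if any such assumption does not hold at
  // runtime, control branches to the scalar loop (Bypass).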
2820 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2821 "scev.check"); 2822 Value *SCEVCheck = Exp.expandCodeForPredicate( 2823 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 2824 2825 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2826 if (C->isZero()) 2827 return; 2828 2829 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 2830 (OptForSizeBasedOnProfile && 2831 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 2832 "Cannot SCEV check stride or overflow when optimizing for size"); 2833 2834 SCEVCheckBlock->setName("vector.scevcheck"); 2835 // Create new preheader for vector loop. 2836 LoopVectorPreHeader = 2837 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 2838 nullptr, "vector.ph"); 2839 2840 // Update dominator only if this is first RT check. 2841 if (LoopBypassBlocks.empty()) { 2842 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2843 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2844 } 2845 2846 ReplaceInstWithInst( 2847 SCEVCheckBlock->getTerminator(), 2848 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 2849 LoopBypassBlocks.push_back(SCEVCheckBlock); 2850 AddedSafetyChecks = true; 2851 } 2852 2853 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2854 // VPlan-native path does not do any analysis for runtime checks currently. 2855 if (EnableVPlanNativePath) 2856 return; 2857 2858 // Reuse existing vector loop preheader for runtime memory checks. 2859 // Note that new preheader block is generated for vector loop. 2860 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 2861 2862 // Generate the code that checks in runtime if arrays overlap. We put the 2863 // checks into a separate block to make the more common case of few elements 2864 // faster. 2865 auto *LAI = Legal->getLAI(); 2866 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 2867 if (!RtPtrChecking.Need) 2868 return; 2869 Instruction *FirstCheckInst; 2870 Instruction *MemRuntimeCheck; 2871 std::tie(FirstCheckInst, MemRuntimeCheck) = 2872 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 2873 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 2874 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " 2875 "claimed checks are required"); 2876 2877 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 2878 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2879 "Cannot emit memory checks when optimizing for size, unless forced " 2880 "to vectorize."); 2881 ORE->emit([&]() { 2882 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2883 L->getStartLoc(), L->getHeader()) 2884 << "Code-size may be reduced by not forcing " 2885 "vectorization, or by source-code modifications " 2886 "eliminating the need for runtime checks " 2887 "(e.g., adding 'restrict')."; 2888 }); 2889 } 2890 2891 MemCheckBlock->setName("vector.memcheck"); 2892 // Create new preheader for vector loop. 2893 LoopVectorPreHeader = 2894 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2895 "vector.ph"); 2896 2897 // Update dominator only if this is first RT check. 
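  // (If a trip-count or SCEV check block was emitted earlier, it already
  // dominates Bypass and the loop exit, so the immediate dominators are left
  // unchanged in that case.)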
2898 if (LoopBypassBlocks.empty()) { 2899 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2900 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2901 } 2902 2903 ReplaceInstWithInst( 2904 MemCheckBlock->getTerminator(), 2905 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); 2906 LoopBypassBlocks.push_back(MemCheckBlock); 2907 AddedSafetyChecks = true; 2908 2909 // We currently don't use LoopVersioning for the actual loop cloning but we 2910 // still use it to add the noalias metadata. 2911 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2912 PSE.getSE()); 2913 LVer->prepareNoAliasMetadata(); 2914 } 2915 2916 Value *InnerLoopVectorizer::emitTransformedIndex( 2917 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2918 const InductionDescriptor &ID) const { 2919 2920 SCEVExpander Exp(*SE, DL, "induction"); 2921 auto Step = ID.getStep(); 2922 auto StartValue = ID.getStartValue(); 2923 assert(Index->getType() == Step->getType() && 2924 "Index type does not match StepValue type"); 2925 2926 // Note: the IR at this point is broken. We cannot use SE to create any new 2927 // SCEV and then expand it, hoping that SCEV's simplification will give us 2928 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2929 // lead to various SCEV crashes. So all we can do is to use builder and rely 2930 // on InstCombine for future simplifications. Here we handle some trivial 2931 // cases only. 2932 auto CreateAdd = [&B](Value *X, Value *Y) { 2933 assert(X->getType() == Y->getType() && "Types don't match!"); 2934 if (auto *CX = dyn_cast<ConstantInt>(X)) 2935 if (CX->isZero()) 2936 return Y; 2937 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2938 if (CY->isZero()) 2939 return X; 2940 return B.CreateAdd(X, Y); 2941 }; 2942 2943 auto CreateMul = [&B](Value *X, Value *Y) { 2944 assert(X->getType() == Y->getType() && "Types don't match!"); 2945 if (auto *CX = dyn_cast<ConstantInt>(X)) 2946 if (CX->isOne()) 2947 return Y; 2948 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2949 if (CY->isOne()) 2950 return X; 2951 return B.CreateMul(X, Y); 2952 }; 2953 2954 // Get a suitable insert point for SCEV expansion. For blocks in the vector 2955 // loop, choose the end of the vector loop header (=LoopVectorBody), because 2956 // the DomTree is not kept up-to-date for additional blocks generated in the 2957 // vector loop. By using the header as insertion point, we guarantee that the 2958 // expanded instructions dominate all their uses. 
2959 auto GetInsertPoint = [this, &B]() { 2960 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 2961 if (InsertBB != LoopVectorBody && 2962 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 2963 return LoopVectorBody->getTerminator(); 2964 return &*B.GetInsertPoint(); 2965 }; 2966 switch (ID.getKind()) { 2967 case InductionDescriptor::IK_IntInduction: { 2968 assert(Index->getType() == StartValue->getType() && 2969 "Index type does not match StartValue type"); 2970 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 2971 return B.CreateSub(StartValue, Index); 2972 auto *Offset = CreateMul( 2973 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 2974 return CreateAdd(StartValue, Offset); 2975 } 2976 case InductionDescriptor::IK_PtrInduction: { 2977 assert(isa<SCEVConstant>(Step) && 2978 "Expected constant step for pointer induction"); 2979 return B.CreateGEP( 2980 StartValue->getType()->getPointerElementType(), StartValue, 2981 CreateMul(Index, 2982 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 2983 } 2984 case InductionDescriptor::IK_FpInduction: { 2985 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2986 auto InductionBinOp = ID.getInductionBinOp(); 2987 assert(InductionBinOp && 2988 (InductionBinOp->getOpcode() == Instruction::FAdd || 2989 InductionBinOp->getOpcode() == Instruction::FSub) && 2990 "Original bin op should be defined for FP induction"); 2991 2992 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 2993 2994 // Floating point operations had to be 'fast' to enable the induction. 2995 FastMathFlags Flags; 2996 Flags.setFast(); 2997 2998 Value *MulExp = B.CreateFMul(StepValue, Index); 2999 if (isa<Instruction>(MulExp)) 3000 // We have to check, the MulExp may be a constant. 3001 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 3002 3003 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3004 "induction"); 3005 if (isa<Instruction>(BOp)) 3006 cast<Instruction>(BOp)->setFastMathFlags(Flags); 3007 3008 return BOp; 3009 } 3010 case InductionDescriptor::IK_NoInduction: 3011 return nullptr; 3012 } 3013 llvm_unreachable("invalid enum"); 3014 } 3015 3016 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3017 LoopScalarBody = OrigLoop->getHeader(); 3018 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3019 LoopExitBlock = OrigLoop->getExitBlock(); 3020 assert(LoopExitBlock && "Must have an exit block"); 3021 assert(LoopVectorPreHeader && "Invalid loop structure"); 3022 3023 LoopMiddleBlock = 3024 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3025 LI, nullptr, Twine(Prefix) + "middle.block"); 3026 LoopScalarPreHeader = 3027 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3028 nullptr, Twine(Prefix) + "scalar.ph"); 3029 // We intentionally don't let SplitBlock to update LoopInfo since 3030 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3031 // LoopVectorBody is explicitly added to the correct place few lines later. 3032 LoopVectorBody = 3033 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3034 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3035 3036 // Update dominator for loop exit. 3037 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3038 3039 // Create and register the new vector loop. 
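  // (LoopVectorBody was deliberately split off above without updating
  // LoopInfo; it is added to the newly allocated loop below, which is then
  // registered either as a child of OrigLoop's parent or as a new top-level
  // loop.)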
3040 Loop *Lp = LI->AllocateLoop(); 3041 Loop *ParentLoop = OrigLoop->getParentLoop(); 3042 3043 // Insert the new loop into the loop nest and register the new basic blocks 3044 // before calling any utilities such as SCEV that require valid LoopInfo. 3045 if (ParentLoop) { 3046 ParentLoop->addChildLoop(Lp); 3047 } else { 3048 LI->addTopLevelLoop(Lp); 3049 } 3050 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3051 return Lp; 3052 } 3053 3054 void InnerLoopVectorizer::createInductionResumeValues(Loop *L, 3055 Value *VectorTripCount) { 3056 assert(VectorTripCount && L && "Expected valid arguments"); 3057 // We are going to resume the execution of the scalar loop. 3058 // Go over all of the induction variables that we found and fix the 3059 // PHIs that are left in the scalar version of the loop. 3060 // The starting values of PHI nodes depend on the counter of the last 3061 // iteration in the vectorized loop. 3062 // If we come from a bypass edge then we need to start from the original 3063 // start value. 3064 for (auto &InductionEntry : Legal->getInductionVars()) { 3065 PHINode *OrigPhi = InductionEntry.first; 3066 InductionDescriptor II = InductionEntry.second; 3067 3068 // Create phi nodes to merge from the backedge-taken check block. 3069 PHINode *BCResumeVal = 3070 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3071 LoopScalarPreHeader->getTerminator()); 3072 // Copy original phi DL over to the new one. 3073 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3074 Value *&EndValue = IVEndValues[OrigPhi]; 3075 if (OrigPhi == OldInduction) { 3076 // We know what the end value is. 3077 EndValue = VectorTripCount; 3078 } else { 3079 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3080 Type *StepType = II.getStep()->getType(); 3081 Instruction::CastOps CastOp = 3082 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3083 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3084 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3085 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3086 EndValue->setName("ind.end"); 3087 } 3088 3089 // The new PHI merges the original incoming value, in case of a bypass, 3090 // or the value at the end of the vectorized loop. 3091 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3092 3093 // Fix the scalar body counter (PHI node). 3094 // The old induction's phi node in the scalar body needs the truncated 3095 // value. 3096 for (BasicBlock *BB : LoopBypassBlocks) 3097 BCResumeVal->addIncoming(II.getStartValue(), BB); 3098 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3099 } 3100 } 3101 3102 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3103 MDNode *OrigLoopID) { 3104 assert(L && "Expected valid loop."); 3105 3106 // The trip counts should be cached by now. 3107 Value *Count = getOrCreateTripCount(L); 3108 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3109 3110 // We need the OrigLoop (scalar loop part) latch terminator to help 3111 // produce correct debug info for the middle block BB instructions. 3112 // The legality check stage guarantees that the loop will have a single 3113 // latch. 
3114 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3115 "Scalar loop latch terminator isn't a branch"); 3116 BranchInst *ScalarLatchBr = 3117 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3118 3119 // Add a check in the middle block to see if we have completed 3120 // all of the iterations in the first vector loop. 3121 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3122 // If tail is to be folded, we know we don't need to run the remainder. 3123 Value *CmpN = Builder.getTrue(); 3124 if (!Cost->foldTailByMasking()) { 3125 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3126 VectorTripCount, "cmp.n", 3127 LoopMiddleBlock->getTerminator()); 3128 3129 // Here we use the same DebugLoc as the scalar loop latch branch instead 3130 // of the corresponding compare because they may have ended up with 3131 // different line numbers and we want to avoid awkward line stepping while 3132 // debugging. Eg. if the compare has got a line number inside the loop. 3133 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3134 } 3135 3136 BranchInst *BrInst = 3137 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3138 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3139 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3140 3141 // Get ready to start creating new instructions into the vectorized body. 3142 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3143 "Inconsistent vector loop preheader"); 3144 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3145 3146 Optional<MDNode *> VectorizedLoopID = 3147 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3148 LLVMLoopVectorizeFollowupVectorized}); 3149 if (VectorizedLoopID.hasValue()) { 3150 L->setLoopID(VectorizedLoopID.getValue()); 3151 3152 // Do not setAlreadyVectorized if loop attributes have been defined 3153 // explicitly. 3154 return LoopVectorPreHeader; 3155 } 3156 3157 // Keep all loop hints from the original loop on the vector loop (we'll 3158 // replace the vectorizer-specific hints below). 3159 if (MDNode *LID = OrigLoop->getLoopID()) 3160 L->setLoopID(LID); 3161 3162 LoopVectorizeHints Hints(L, true, *ORE); 3163 Hints.setAlreadyVectorized(); 3164 3165 #ifdef EXPENSIVE_CHECKS 3166 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3167 LI->verify(*DT); 3168 #endif 3169 3170 return LoopVectorPreHeader; 3171 } 3172 3173 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3174 /* 3175 In this function we generate a new loop. The new loop will contain 3176 the vectorized instructions while the old loop will continue to run the 3177 scalar remainder. 3178 3179 [ ] <-- loop iteration number check. 3180 / | 3181 / v 3182 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3183 | / | 3184 | / v 3185 || [ ] <-- vector pre header. 3186 |/ | 3187 | v 3188 | [ ] \ 3189 | [ ]_| <-- vector loop. 3190 | | 3191 | v 3192 | -[ ] <--- middle-block. 3193 | / | 3194 | / v 3195 -|- >[ ] <--- new preheader. 3196 | | 3197 | v 3198 | [ ] \ 3199 | [ ]_| <-- old scalar loop to handle remainder. 3200 \ | 3201 \ v 3202 >[ ] <-- exit block. 3203 ... 3204 */ 3205 3206 // Get the metadata of the original loop before it gets modified. 3207 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3208 3209 // Create an empty vector loop, and prepare basic blocks for the runtime 3210 // checks. 3211 Loop *Lp = createVectorLoopSkeleton(""); 3212 3213 // Now, compare the new count to zero. 
If it is zero skip the vector loop and 3214 // jump to the scalar loop. This check also covers the case where the 3215 // backedge-taken count is uint##_max: adding one to it will overflow leading 3216 // to an incorrect trip count of zero. In this (rare) case we will also jump 3217 // to the scalar loop. 3218 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3219 3220 // Generate the code to check any assumptions that we've made for SCEV 3221 // expressions. 3222 emitSCEVChecks(Lp, LoopScalarPreHeader); 3223 3224 // Generate the code that checks in runtime if arrays overlap. We put the 3225 // checks into a separate block to make the more common case of few elements 3226 // faster. 3227 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3228 3229 // Some loops have a single integer induction variable, while other loops 3230 // don't. One example is c++ iterators that often have multiple pointer 3231 // induction variables. In the code below we also support a case where we 3232 // don't have a single induction variable. 3233 // 3234 // We try to obtain an induction variable from the original loop as hard 3235 // as possible. However if we don't find one that: 3236 // - is an integer 3237 // - counts from zero, stepping by one 3238 // - is the size of the widest induction variable type 3239 // then we create a new one. 3240 OldInduction = Legal->getPrimaryInduction(); 3241 Type *IdxTy = Legal->getWidestInductionType(); 3242 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3243 // The loop step is equal to the vectorization factor (num of SIMD elements) 3244 // times the unroll factor (num of SIMD instructions). 3245 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 3246 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3247 Induction = 3248 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3249 getDebugLocFromInstOrOperands(OldInduction)); 3250 3251 // Emit phis for the new starting index of the scalar loop. 3252 createInductionResumeValues(Lp, CountRoundDown); 3253 3254 return completeLoopSkeleton(Lp, OrigLoopID); 3255 } 3256 3257 // Fix up external users of the induction variable. At this point, we are 3258 // in LCSSA form, with all external PHIs that use the IV having one input value, 3259 // coming from the remainder loop. We need those PHIs to also have a correct 3260 // value for the IV when arriving directly from the middle block. 3261 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3262 const InductionDescriptor &II, 3263 Value *CountRoundDown, Value *EndValue, 3264 BasicBlock *MiddleBlock) { 3265 // There are two kinds of external IV usages - those that use the value 3266 // computed in the last iteration (the PHI) and those that use the penultimate 3267 // value (the value that feeds into the phi from the loop latch). 3268 // We allow both, but they, obviously, have different values. 3269 3270 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3271 3272 DenseMap<Value *, Value *> MissingVals; 3273 3274 // An external user of the last iteration's value should see the value that 3275 // the remainder loop uses to initialize its own IV. 3276 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3277 for (User *U : PostInc->users()) { 3278 Instruction *UI = cast<Instruction>(U); 3279 if (!OrigLoop->contains(UI)) { 3280 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3281 MissingVals[UI] = EndValue; 3282 } 3283 } 3284 3285 // An external user of the penultimate value need to see EndValue - Step. 
3286 // The simplest way to get this is to recompute it from the constituent SCEVs, 3287 // that is Start + (Step * (CRD - 1)). 3288 for (User *U : OrigPhi->users()) { 3289 auto *UI = cast<Instruction>(U); 3290 if (!OrigLoop->contains(UI)) { 3291 const DataLayout &DL = 3292 OrigLoop->getHeader()->getModule()->getDataLayout(); 3293 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3294 3295 IRBuilder<> B(MiddleBlock->getTerminator()); 3296 Value *CountMinusOne = B.CreateSub( 3297 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3298 Value *CMO = 3299 !II.getStep()->getType()->isIntegerTy() 3300 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3301 II.getStep()->getType()) 3302 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3303 CMO->setName("cast.cmo"); 3304 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3305 Escape->setName("ind.escape"); 3306 MissingVals[UI] = Escape; 3307 } 3308 } 3309 3310 for (auto &I : MissingVals) { 3311 PHINode *PHI = cast<PHINode>(I.first); 3312 // One corner case we have to handle is two IVs "chasing" each-other, 3313 // that is %IV2 = phi [...], [ %IV1, %latch ] 3314 // In this case, if IV1 has an external use, we need to avoid adding both 3315 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3316 // don't already have an incoming value for the middle block. 3317 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3318 PHI->addIncoming(I.second, MiddleBlock); 3319 } 3320 } 3321 3322 namespace { 3323 3324 struct CSEDenseMapInfo { 3325 static bool canHandle(const Instruction *I) { 3326 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3327 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3328 } 3329 3330 static inline Instruction *getEmptyKey() { 3331 return DenseMapInfo<Instruction *>::getEmptyKey(); 3332 } 3333 3334 static inline Instruction *getTombstoneKey() { 3335 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3336 } 3337 3338 static unsigned getHashValue(const Instruction *I) { 3339 assert(canHandle(I) && "Unknown instruction!"); 3340 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3341 I->value_op_end())); 3342 } 3343 3344 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3345 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3346 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3347 return LHS == RHS; 3348 return LHS->isIdenticalTo(RHS); 3349 } 3350 }; 3351 3352 } // end anonymous namespace 3353 3354 ///Perform cse of induction variable instructions. 3355 static void cse(BasicBlock *BB) { 3356 // Perform simple cse. 3357 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3358 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3359 Instruction *In = &*I++; 3360 3361 if (!CSEDenseMapInfo::canHandle(In)) 3362 continue; 3363 3364 // Check if we can replace this instruction with any of the 3365 // visited instructions. 
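    // (Two instructions are considered equivalent here when isIdenticalTo()
    // holds, i.e. they perform the same operation on matching operands; see
    // CSEDenseMapInfo::isEqual above.)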
3366 if (Instruction *V = CSEMap.lookup(In)) { 3367 In->replaceAllUsesWith(V); 3368 In->eraseFromParent(); 3369 continue; 3370 } 3371 3372 CSEMap[In] = In; 3373 } 3374 } 3375 3376 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3377 unsigned VF, 3378 bool &NeedToScalarize) { 3379 Function *F = CI->getCalledFunction(); 3380 Type *ScalarRetTy = CI->getType(); 3381 SmallVector<Type *, 4> Tys, ScalarTys; 3382 for (auto &ArgOp : CI->arg_operands()) 3383 ScalarTys.push_back(ArgOp->getType()); 3384 3385 // Estimate cost of scalarized vector call. The source operands are assumed 3386 // to be vectors, so we need to extract individual elements from there, 3387 // execute VF scalar calls, and then gather the result into the vector return 3388 // value. 3389 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, 3390 TTI::TCK_RecipThroughput); 3391 if (VF == 1) 3392 return ScalarCallCost; 3393 3394 // Compute corresponding vector type for return value and arguments. 3395 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3396 for (Type *ScalarTy : ScalarTys) 3397 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3398 3399 // Compute costs of unpacking argument values for the scalar calls and 3400 // packing the return values to a vector. 3401 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3402 3403 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3404 3405 // If we can't emit a vector call for this function, then the currently found 3406 // cost is the cost we need to return. 3407 NeedToScalarize = true; 3408 VFShape Shape = 3409 VFShape::get(*CI, ElementCount::getFixed(VF), false /*HasGlobalPred*/); 3410 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3411 3412 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3413 return Cost; 3414 3415 // If the corresponding vector cost is cheaper, return its cost. 3416 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, 3417 TTI::TCK_RecipThroughput); 3418 if (VectorCallCost < Cost) { 3419 NeedToScalarize = false; 3420 return VectorCallCost; 3421 } 3422 return Cost; 3423 } 3424 3425 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3426 unsigned VF) { 3427 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3428 assert(ID && "Expected intrinsic call!"); 3429 3430 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3431 return TTI.getIntrinsicInstrCost(CostAttrs, 3432 TargetTransformInfo::TCK_RecipThroughput); 3433 } 3434 3435 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3436 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3437 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3438 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3439 } 3440 3441 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3442 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3443 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3444 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3445 } 3446 3447 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3448 // For every instruction `I` in MinBWs, truncate the operands, create a 3449 // truncated version of `I` and reextend its result. InstCombine runs 3450 // later and will remove any ext/trunc pairs. 3451 SmallPtrSet<Value *, 4> Erased; 3452 for (const auto &KV : Cost->getMinimalBitwidths()) { 3453 // If the value wasn't vectorized, we must maintain the original scalar 3454 // type. 
The absence of the value from VectorLoopValueMap indicates that it 3455 // wasn't vectorized. 3456 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3457 continue; 3458 for (unsigned Part = 0; Part < UF; ++Part) { 3459 Value *I = getOrCreateVectorValue(KV.first, Part); 3460 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3461 continue; 3462 Type *OriginalTy = I->getType(); 3463 Type *ScalarTruncatedTy = 3464 IntegerType::get(OriginalTy->getContext(), KV.second); 3465 auto *TruncatedTy = FixedVectorType::get( 3466 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements()); 3467 if (TruncatedTy == OriginalTy) 3468 continue; 3469 3470 IRBuilder<> B(cast<Instruction>(I)); 3471 auto ShrinkOperand = [&](Value *V) -> Value * { 3472 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3473 if (ZI->getSrcTy() == TruncatedTy) 3474 return ZI->getOperand(0); 3475 return B.CreateZExtOrTrunc(V, TruncatedTy); 3476 }; 3477 3478 // The actual instruction modification depends on the instruction type, 3479 // unfortunately. 3480 Value *NewI = nullptr; 3481 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3482 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3483 ShrinkOperand(BO->getOperand(1))); 3484 3485 // Any wrapping introduced by shrinking this operation shouldn't be 3486 // considered undefined behavior. So, we can't unconditionally copy 3487 // arithmetic wrapping flags to NewI. 3488 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3489 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3490 NewI = 3491 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3492 ShrinkOperand(CI->getOperand(1))); 3493 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3494 NewI = B.CreateSelect(SI->getCondition(), 3495 ShrinkOperand(SI->getTrueValue()), 3496 ShrinkOperand(SI->getFalseValue())); 3497 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3498 switch (CI->getOpcode()) { 3499 default: 3500 llvm_unreachable("Unhandled cast!"); 3501 case Instruction::Trunc: 3502 NewI = ShrinkOperand(CI->getOperand(0)); 3503 break; 3504 case Instruction::SExt: 3505 NewI = B.CreateSExtOrTrunc( 3506 CI->getOperand(0), 3507 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3508 break; 3509 case Instruction::ZExt: 3510 NewI = B.CreateZExtOrTrunc( 3511 CI->getOperand(0), 3512 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3513 break; 3514 } 3515 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3516 auto Elements0 = 3517 cast<VectorType>(SI->getOperand(0)->getType())->getNumElements(); 3518 auto *O0 = B.CreateZExtOrTrunc( 3519 SI->getOperand(0), 3520 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3521 auto Elements1 = 3522 cast<VectorType>(SI->getOperand(1)->getType())->getNumElements(); 3523 auto *O1 = B.CreateZExtOrTrunc( 3524 SI->getOperand(1), 3525 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3526 3527 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3528 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3529 // Don't do anything with the operands, just extend the result. 
3530 continue; 3531 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3532 auto Elements = 3533 cast<VectorType>(IE->getOperand(0)->getType())->getNumElements(); 3534 auto *O0 = B.CreateZExtOrTrunc( 3535 IE->getOperand(0), 3536 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3537 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3538 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3539 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3540 auto Elements = 3541 cast<VectorType>(EE->getOperand(0)->getType())->getNumElements(); 3542 auto *O0 = B.CreateZExtOrTrunc( 3543 EE->getOperand(0), 3544 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3545 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3546 } else { 3547 // If we don't know what to do, be conservative and don't do anything. 3548 continue; 3549 } 3550 3551 // Lastly, extend the result. 3552 NewI->takeName(cast<Instruction>(I)); 3553 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3554 I->replaceAllUsesWith(Res); 3555 cast<Instruction>(I)->eraseFromParent(); 3556 Erased.insert(I); 3557 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3558 } 3559 } 3560 3561 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3562 for (const auto &KV : Cost->getMinimalBitwidths()) { 3563 // If the value wasn't vectorized, we must maintain the original scalar 3564 // type. The absence of the value from VectorLoopValueMap indicates that it 3565 // wasn't vectorized. 3566 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3567 continue; 3568 for (unsigned Part = 0; Part < UF; ++Part) { 3569 Value *I = getOrCreateVectorValue(KV.first, Part); 3570 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3571 if (Inst && Inst->use_empty()) { 3572 Value *NewI = Inst->getOperand(0); 3573 Inst->eraseFromParent(); 3574 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3575 } 3576 } 3577 } 3578 } 3579 3580 void InnerLoopVectorizer::fixVectorizedLoop() { 3581 // Insert truncates and extends for any truncated instructions as hints to 3582 // InstCombine. 3583 if (VF > 1) 3584 truncateToMinimalBitwidths(); 3585 3586 // Fix widened non-induction PHIs by setting up the PHI operands. 3587 if (OrigPHIsToFix.size()) { 3588 assert(EnableVPlanNativePath && 3589 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3590 fixNonInductionPHIs(); 3591 } 3592 3593 // At this point every instruction in the original loop is widened to a 3594 // vector form. Now we need to fix the recurrences in the loop. These PHI 3595 // nodes are currently empty because we did not want to introduce cycles. 3596 // This is the second stage of vectorizing recurrences. 3597 fixCrossIterationPHIs(); 3598 3599 // Forget the original basic block. 3600 PSE.getSE()->forgetLoop(OrigLoop); 3601 3602 // Fix-up external users of the induction variables. 3603 for (auto &Entry : Legal->getInductionVars()) 3604 fixupIVUsers(Entry.first, Entry.second, 3605 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3606 IVEndValues[Entry.first], LoopMiddleBlock); 3607 3608 fixLCSSAPHIs(); 3609 for (Instruction *PI : PredicatedInstructions) 3610 sinkScalarOperands(&*PI); 3611 3612 // Remove redundant induction instructions. 3613 cse(LoopVectorBody); 3614 3615 // Set/update profile weights for the vector and remainder loops as original 3616 // loop iterations are now distributed among them. Note that original loop 3617 // represented by LoopScalarBody becomes remainder loop after vectorization. 
3618 // 3619 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3620 // end up getting slightly roughened result but that should be OK since 3621 // profile is not inherently precise anyway. Note also possible bypass of 3622 // vector code caused by legality checks is ignored, assigning all the weight 3623 // to the vector loop, optimistically. 3624 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), 3625 LI->getLoopFor(LoopVectorBody), 3626 LI->getLoopFor(LoopScalarBody), VF * UF); 3627 } 3628 3629 void InnerLoopVectorizer::fixCrossIterationPHIs() { 3630 // In order to support recurrences we need to be able to vectorize Phi nodes. 3631 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3632 // stage #2: We now need to fix the recurrences by adding incoming edges to 3633 // the currently empty PHI nodes. At this point every instruction in the 3634 // original loop is widened to a vector form so we can use them to construct 3635 // the incoming edges. 3636 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 3637 // Handle first-order recurrences and reductions that need to be fixed. 3638 if (Legal->isFirstOrderRecurrence(&Phi)) 3639 fixFirstOrderRecurrence(&Phi); 3640 else if (Legal->isReductionVariable(&Phi)) 3641 fixReduction(&Phi); 3642 } 3643 } 3644 3645 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3646 // This is the second phase of vectorizing first-order recurrences. An 3647 // overview of the transformation is described below. Suppose we have the 3648 // following loop. 3649 // 3650 // for (int i = 0; i < n; ++i) 3651 // b[i] = a[i] - a[i - 1]; 3652 // 3653 // There is a first-order recurrence on "a". For this loop, the shorthand 3654 // scalar IR looks like: 3655 // 3656 // scalar.ph: 3657 // s_init = a[-1] 3658 // br scalar.body 3659 // 3660 // scalar.body: 3661 // i = phi [0, scalar.ph], [i+1, scalar.body] 3662 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3663 // s2 = a[i] 3664 // b[i] = s2 - s1 3665 // br cond, scalar.body, ... 3666 // 3667 // In this example, s1 is a recurrence because it's value depends on the 3668 // previous iteration. In the first phase of vectorization, we created a 3669 // temporary value for s1. We now complete the vectorization and produce the 3670 // shorthand vector IR shown below (for VF = 4, UF = 1). 3671 // 3672 // vector.ph: 3673 // v_init = vector(..., ..., ..., a[-1]) 3674 // br vector.body 3675 // 3676 // vector.body 3677 // i = phi [0, vector.ph], [i+4, vector.body] 3678 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3679 // v2 = a[i, i+1, i+2, i+3]; 3680 // v3 = vector(v1(3), v2(0, 1, 2)) 3681 // b[i, i+1, i+2, i+3] = v2 - v3 3682 // br cond, vector.body, middle.block 3683 // 3684 // middle.block: 3685 // x = v2(3) 3686 // br scalar.ph 3687 // 3688 // scalar.ph: 3689 // s_init = phi [x, middle.block], [a[-1], otherwise] 3690 // br scalar.body 3691 // 3692 // After execution completes the vector loop, we extract the next value of 3693 // the recurrence (x) to use as the initial value in the scalar loop. 3694 3695 // Get the original loop preheader and single loop latch. 3696 auto *Preheader = OrigLoop->getLoopPreheader(); 3697 auto *Latch = OrigLoop->getLoopLatch(); 3698 3699 // Get the initial and previous values of the scalar recurrence. 3700 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 3701 auto *Previous = Phi->getIncomingValueForBlock(Latch); 3702 3703 // Create a vector from the initial value. 
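  // For VF = 4 this matches v_init in the sketch above: the scalar initial
  // value is inserted into the last lane of an otherwise undef vector.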
3704 auto *VectorInit = ScalarInit; 3705 if (VF > 1) { 3706 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3707 VectorInit = Builder.CreateInsertElement( 3708 UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF)), 3709 VectorInit, Builder.getInt32(VF - 1), "vector.recur.init"); 3710 } 3711 3712 // We constructed a temporary phi node in the first phase of vectorization. 3713 // This phi node will eventually be deleted. 3714 Builder.SetInsertPoint( 3715 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3716 3717 // Create a phi node for the new recurrence. The current value will either be 3718 // the initial value inserted into a vector or loop-varying vector value. 3719 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3720 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3721 3722 // Get the vectorized previous value of the last part UF - 1. It appears last 3723 // among all unrolled iterations, due to the order of their construction. 3724 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3725 3726 // Find and set the insertion point after the previous value if it is an 3727 // instruction. 3728 BasicBlock::iterator InsertPt; 3729 // Note that the previous value may have been constant-folded so it is not 3730 // guaranteed to be an instruction in the vector loop. 3731 // FIXME: Loop invariant values do not form recurrences. We should deal with 3732 // them earlier. 3733 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3734 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3735 else { 3736 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3737 if (isa<PHINode>(PreviousLastPart)) 3738 // If the previous value is a phi node, we should insert after all the phi 3739 // nodes in the block containing the PHI to avoid breaking basic block 3740 // verification. Note that the basic block may be different to 3741 // LoopVectorBody, in case we predicate the loop. 3742 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3743 else 3744 InsertPt = ++PreviousInst->getIterator(); 3745 } 3746 Builder.SetInsertPoint(&*InsertPt); 3747 3748 // We will construct a vector for the recurrence by combining the values for 3749 // the current and previous iterations. This is the required shuffle mask. 3750 SmallVector<int, 8> ShuffleMask(VF); 3751 ShuffleMask[0] = VF - 1; 3752 for (unsigned I = 1; I < VF; ++I) 3753 ShuffleMask[I] = I + VF - 1; 3754 3755 // The vector from which to take the initial value for the current iteration 3756 // (actual or unrolled). Initially, this is the vector phi node. 3757 Value *Incoming = VecPhi; 3758 3759 // Shuffle the current and previous vector and update the vector parts. 3760 for (unsigned Part = 0; Part < UF; ++Part) { 3761 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3762 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3763 auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, 3764 ShuffleMask) 3765 : Incoming; 3766 PhiPart->replaceAllUsesWith(Shuffle); 3767 cast<Instruction>(PhiPart)->eraseFromParent(); 3768 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3769 Incoming = PreviousPart; 3770 } 3771 3772 // Fix the latch value of the new recurrence in the vector loop. 3773 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3774 3775 // Extract the last vector element in the middle block. 
This will be the 3776 // initial value for the recurrence when jumping to the scalar loop. 3777 auto *ExtractForScalar = Incoming; 3778 if (VF > 1) { 3779 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3780 ExtractForScalar = Builder.CreateExtractElement( 3781 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); 3782 } 3783 // Extract the second last element in the middle block if the 3784 // Phi is used outside the loop. We need to extract the phi itself 3785 // and not the last element (the phi update in the current iteration). This 3786 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3787 // when the scalar loop is not run at all. 3788 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3789 if (VF > 1) 3790 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3791 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); 3792 // When loop is unrolled without vectorizing, initialize 3793 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 3794 // `Incoming`. This is analogous to the vectorized case above: extracting the 3795 // second last element when VF > 1. 3796 else if (UF > 1) 3797 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 3798 3799 // Fix the initial value of the original recurrence in the scalar loop. 3800 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3801 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3802 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3803 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3804 Start->addIncoming(Incoming, BB); 3805 } 3806 3807 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3808 Phi->setName("scalar.recur"); 3809 3810 // Finally, fix users of the recurrence outside the loop. The users will need 3811 // either the last value of the scalar recurrence or the last value of the 3812 // vector recurrence we extracted in the middle block. Since the loop is in 3813 // LCSSA form, we just need to find all the phi nodes for the original scalar 3814 // recurrence in the exit block, and then add an edge for the middle block. 3815 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3816 if (LCSSAPhi.getIncomingValue(0) == Phi) { 3817 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 3818 } 3819 } 3820 } 3821 3822 void InnerLoopVectorizer::fixReduction(PHINode *Phi) { 3823 Constant *Zero = Builder.getInt32(0); 3824 3825 // Get it's reduction variable descriptor. 3826 assert(Legal->isReductionVariable(Phi) && 3827 "Unable to find the reduction variable"); 3828 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 3829 3830 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3831 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3832 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3833 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = 3834 RdxDesc.getMinMaxRecurrenceKind(); 3835 setDebugLocFromInst(Builder, ReductionStartValue); 3836 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); 3837 3838 // We need to generate a reduction vector from the incoming scalar. 3839 // To do so, we need to generate the 'identity' vector and override 3840 // one of the elements with the incoming scalar reduction. We need 3841 // to do it in the vector-loop preheader. 
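  // For example (names illustrative), for an integer add reduction with
  // start value %s and VF = 4, the vector start value becomes <%s, 0, 0, 0>
  // and the identity used for the remaining unroll parts is the all-zero
  // vector.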
3842 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3843 3844 // This is the vector-clone of the value that leaves the loop. 3845 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); 3846 3847 // Find the reduction identity variable. Zero for addition, or, xor, 3848 // one for multiplication, -1 for And. 3849 Value *Identity; 3850 Value *VectorStart; 3851 if (RK == RecurrenceDescriptor::RK_IntegerMinMax || 3852 RK == RecurrenceDescriptor::RK_FloatMinMax) { 3853 // MinMax reduction have the start value as their identify. 3854 if (VF == 1 || IsInLoopReductionPhi) { 3855 VectorStart = Identity = ReductionStartValue; 3856 } else { 3857 VectorStart = Identity = 3858 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); 3859 } 3860 } else { 3861 // Handle other reduction kinds: 3862 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 3863 RK, VecTy->getScalarType()); 3864 if (VF == 1 || IsInLoopReductionPhi) { 3865 Identity = Iden; 3866 // This vector is the Identity vector where the first element is the 3867 // incoming scalar reduction. 3868 VectorStart = ReductionStartValue; 3869 } else { 3870 Identity = ConstantVector::getSplat(ElementCount::getFixed(VF), Iden); 3871 3872 // This vector is the Identity vector where the first element is the 3873 // incoming scalar reduction. 3874 VectorStart = 3875 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); 3876 } 3877 } 3878 3879 // Wrap flags are in general invalid after vectorization, clear them. 3880 clearReductionWrapFlags(RdxDesc); 3881 3882 // Fix the vector-loop phi. 3883 3884 // Reductions do not have to start at zero. They can start with 3885 // any loop invariant values. 3886 BasicBlock *Latch = OrigLoop->getLoopLatch(); 3887 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 3888 3889 for (unsigned Part = 0; Part < UF; ++Part) { 3890 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 3891 Value *Val = getOrCreateVectorValue(LoopVal, Part); 3892 // Make sure to add the reduction start value only to the 3893 // first unroll part. 3894 Value *StartVal = (Part == 0) ? VectorStart : Identity; 3895 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 3896 cast<PHINode>(VecRdxPhi) 3897 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3898 } 3899 3900 // Before each round, move the insertion point right between 3901 // the PHIs and the values we are going to write. 3902 // This allows us to write both PHINodes and the extractelement 3903 // instructions. 3904 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3905 3906 setDebugLocFromInst(Builder, LoopExitInst); 3907 3908 // If tail is folded by masking, the vector value to leave the loop should be 3909 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3910 // instead of the former. 
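  // Conceptually the live-out then becomes
  //   %sel = select <mask>, <vectorized LoopExitInst>, <vectorized Phi>
  // so that masked-off (inactive) lanes keep the phi value and do not
  // contribute to the final reduction.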
3911 if (Cost->foldTailByMasking()) { 3912 for (unsigned Part = 0; Part < UF; ++Part) { 3913 Value *VecLoopExitInst = 3914 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3915 Value *Sel = nullptr; 3916 for (User *U : VecLoopExitInst->users()) { 3917 if (isa<SelectInst>(U)) { 3918 assert(!Sel && "Reduction exit feeding two selects"); 3919 Sel = U; 3920 } else 3921 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3922 } 3923 assert(Sel && "Reduction exit feeds no select"); 3924 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 3925 3926 // If the target can create a predicated operator for the reduction at no 3927 // extra cost in the loop (for example a predicated vadd), it can be 3928 // cheaper for the select to remain in the loop than be sunk out of it, 3929 // and so use the select value for the phi instead of the old 3930 // LoopExitValue. 3931 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 3932 if (PreferPredicatedReductionSelect || 3933 TTI->preferPredicatedReductionSelect( 3934 RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()), 3935 Phi->getType(), TargetTransformInfo::ReductionFlags())) { 3936 auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); 3937 VecRdxPhi->setIncomingValueForBlock( 3938 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 3939 } 3940 } 3941 } 3942 3943 // If the vector reduction can be performed in a smaller type, we truncate 3944 // then extend the loop exit value to enable InstCombine to evaluate the 3945 // entire expression in the smaller type. 3946 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { 3947 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 3948 Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF); 3949 Builder.SetInsertPoint( 3950 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 3951 VectorParts RdxParts(UF); 3952 for (unsigned Part = 0; Part < UF; ++Part) { 3953 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3954 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3955 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3956 : Builder.CreateZExt(Trunc, VecTy); 3957 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 3958 UI != RdxParts[Part]->user_end();) 3959 if (*UI != Trunc) { 3960 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 3961 RdxParts[Part] = Extnd; 3962 } else { 3963 ++UI; 3964 } 3965 } 3966 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3967 for (unsigned Part = 0; Part < UF; ++Part) { 3968 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3969 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 3970 } 3971 } 3972 3973 // Reduce all of the unrolled parts into a single vector. 3974 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 3975 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3976 3977 // The middle block terminator has already been assigned a DebugLoc here (the 3978 // OrigLoop's single latch terminator). We want the whole middle block to 3979 // appear to execute on this line because: (a) it is all compiler generated, 3980 // (b) these instructions are always executed after evaluating the latch 3981 // conditional branch, and (c) other passes may add new predecessors which 3982 // terminate on this line. 
This is the easiest way to ensure we don't 3983 // accidentally cause an extra step back into the loop while debugging. 3984 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 3985 for (unsigned Part = 1; Part < UF; ++Part) { 3986 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3987 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3988 // Floating point operations had to be 'fast' to enable the reduction. 3989 ReducedPartRdx = addFastMathFlag( 3990 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 3991 ReducedPartRdx, "bin.rdx"), 3992 RdxDesc.getFastMathFlags()); 3993 else 3994 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 3995 RdxPart); 3996 } 3997 3998 // Create the reduction after the loop. Note that inloop reductions create the 3999 // target reduction in the loop using a Reduction recipe. 4000 if (VF > 1 && !IsInLoopReductionPhi) { 4001 bool NoNaN = Legal->hasFunNoNaNAttr(); 4002 ReducedPartRdx = 4003 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 4004 // If the reduction can be performed in a smaller type, we need to extend 4005 // the reduction to the wider type before we branch to the original loop. 4006 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4007 ReducedPartRdx = 4008 RdxDesc.isSigned() 4009 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4010 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4011 } 4012 4013 // Create a phi node that merges control-flow from the backedge-taken check 4014 // block and the middle block. 4015 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4016 LoopScalarPreHeader->getTerminator()); 4017 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4018 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4019 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4020 4021 // Now, we need to fix the users of the reduction variable 4022 // inside and outside of the scalar remainder loop. 4023 // We know that the loop is in LCSSA form. We need to update the 4024 // PHI nodes in the exit blocks. 4025 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4026 // All PHINodes need to have a single entry edge, or two if 4027 // we already fixed them. 4028 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 4029 4030 // We found a reduction value exit-PHI. Update it with the 4031 // incoming bypass edge. 4032 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 4033 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4034 } // end of the LCSSA phi scan. 4035 4036 // Fix the scalar loop reduction variable with the incoming reduction sum 4037 // from the vector body and from the backedge value. 4038 int IncomingEdgeBlockIdx = 4039 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4040 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4041 // Pick the other block. 4042 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 4043 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4044 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4045 } 4046 4047 void InnerLoopVectorizer::clearReductionWrapFlags( 4048 RecurrenceDescriptor &RdxDesc) { 4049 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 4050 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 4051 RK != RecurrenceDescriptor::RK_IntegerMult) 4052 return; 4053 4054 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4055 assert(LoopExitInstr && "null loop exit instruction"); 4056 SmallVector<Instruction *, 8> Worklist; 4057 SmallPtrSet<Instruction *, 8> Visited; 4058 Worklist.push_back(LoopExitInstr); 4059 Visited.insert(LoopExitInstr); 4060 4061 while (!Worklist.empty()) { 4062 Instruction *Cur = Worklist.pop_back_val(); 4063 if (isa<OverflowingBinaryOperator>(Cur)) 4064 for (unsigned Part = 0; Part < UF; ++Part) { 4065 Value *V = getOrCreateVectorValue(Cur, Part); 4066 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4067 } 4068 4069 for (User *U : Cur->users()) { 4070 Instruction *UI = cast<Instruction>(U); 4071 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4072 Visited.insert(UI).second) 4073 Worklist.push_back(UI); 4074 } 4075 } 4076 } 4077 4078 void InnerLoopVectorizer::fixLCSSAPHIs() { 4079 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4080 if (LCSSAPhi.getNumIncomingValues() == 1) { 4081 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4082 // Non-instruction incoming values will have only one value. 4083 unsigned LastLane = 0; 4084 if (isa<Instruction>(IncomingValue)) 4085 LastLane = Cost->isUniformAfterVectorization( 4086 cast<Instruction>(IncomingValue), VF) 4087 ? 0 4088 : VF - 1; 4089 // Can be a loop invariant incoming value or the last scalar value to be 4090 // extracted from the vectorized loop. 4091 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4092 Value *lastIncomingValue = 4093 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 4094 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4095 } 4096 } 4097 } 4098 4099 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4100 // The basic block and loop containing the predicated instruction. 4101 auto *PredBB = PredInst->getParent(); 4102 auto *VectorLoop = LI->getLoopFor(PredBB); 4103 4104 // Initialize a worklist with the operands of the predicated instruction. 4105 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4106 4107 // Holds instructions that we need to analyze again. An instruction may be 4108 // reanalyzed if we don't yet know if we can sink it or not. 4109 SmallVector<Instruction *, 8> InstsToReanalyze; 4110 4111 // Returns true if a given use occurs in the predicated block. Phi nodes use 4112 // their operands in their corresponding predecessor blocks. 4113 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4114 auto *I = cast<Instruction>(U.getUser()); 4115 BasicBlock *BB = I->getParent(); 4116 if (auto *Phi = dyn_cast<PHINode>(I)) 4117 BB = Phi->getIncomingBlock( 4118 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4119 return BB == PredBB; 4120 }; 4121 4122 // Iteratively sink the scalarized operands of the predicated instruction 4123 // into the block we created for it. When an instruction is sunk, it's 4124 // operands are then added to the worklist. The algorithm ends after one pass 4125 // through the worklist doesn't sink a single instruction. 
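  // Illustrative sketch (example assumed, not taken from a test case): for a
  // conditional scalar store
  //
  //   if (cond[i])
  //     out[i] = x;
  //
  // the scalarized address computation feeding the predicated store, e.g.
  //
  //   %gep = getelementptr i32, i32* %out, i64 %idx
  //   store i32 %x, i32* %gep   ; resides in the predicated block
  //
  // has that store as its only user, so the do/while loop below moves %gep
  // into the predicated block and re-adds %gep's own operands to the
  // worklist for the next pass.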
4126 bool Changed; 4127 do { 4128 // Add the instructions that need to be reanalyzed to the worklist, and 4129 // reset the changed indicator. 4130 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4131 InstsToReanalyze.clear(); 4132 Changed = false; 4133 4134 while (!Worklist.empty()) { 4135 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4136 4137 // We can't sink an instruction if it is a phi node, is already in the 4138 // predicated block, is not in the loop, or may have side effects. 4139 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4140 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4141 continue; 4142 4143 // It's legal to sink the instruction if all its uses occur in the 4144 // predicated block. Otherwise, there's nothing to do yet, and we may 4145 // need to reanalyze the instruction. 4146 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4147 InstsToReanalyze.push_back(I); 4148 continue; 4149 } 4150 4151 // Move the instruction to the beginning of the predicated block, and add 4152 // it's operands to the worklist. 4153 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4154 Worklist.insert(I->op_begin(), I->op_end()); 4155 4156 // The sinking may have enabled other instructions to be sunk, so we will 4157 // need to iterate. 4158 Changed = true; 4159 } 4160 } while (Changed); 4161 } 4162 4163 void InnerLoopVectorizer::fixNonInductionPHIs() { 4164 for (PHINode *OrigPhi : OrigPHIsToFix) { 4165 PHINode *NewPhi = 4166 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4167 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4168 4169 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4170 predecessors(OrigPhi->getParent())); 4171 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4172 predecessors(NewPhi->getParent())); 4173 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4174 "Scalar and Vector BB should have the same number of predecessors"); 4175 4176 // The insertion point in Builder may be invalidated by the time we get 4177 // here. Force the Builder insertion point to something valid so that we do 4178 // not run into issues during insertion point restore in 4179 // getOrCreateVectorValue calls below. 4180 Builder.SetInsertPoint(NewPhi); 4181 4182 // The predecessor order is preserved and we can rely on mapping between 4183 // scalar and vector block predecessors. 4184 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4185 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4186 4187 // When looking up the new scalar/vector values to fix up, use incoming 4188 // values from original phi. 4189 Value *ScIncV = 4190 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4191 4192 // Scalar incoming value may need a broadcast 4193 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4194 NewPhi->addIncoming(NewIncV, NewPredBB); 4195 } 4196 } 4197 } 4198 4199 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, 4200 unsigned UF, unsigned VF, 4201 bool IsPtrLoopInvariant, 4202 SmallBitVector &IsIndexLoopInvariant, 4203 VPTransformState &State) { 4204 // Construct a vector GEP by widening the operands of the scalar GEP as 4205 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4206 // results in a vector of pointers when at least one operand of the GEP 4207 // is vector-typed. Thus, to keep the representation compact, we only use 4208 // vector-typed operands for loop-varying values. 
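  // Illustrative example (assumed, not from a test case): for the scalar
  // access
  //
  //   %gep = getelementptr inbounds i32, i32* %base, i64 %iv
  //
  // with a loop-invariant %base and a loop-varying index, VF = 4 yields
  //
  //   %vgep = getelementptr inbounds i32, i32* %base, <4 x i64> %vec.iv
  //
  // i.e. a <4 x i32*> result, while the invariant pointer operand stays
  // scalar.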
4209 4210 if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4211 // If we are vectorizing, but the GEP has only loop-invariant operands, 4212 // the GEP we build (by only using vector-typed operands for 4213 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4214 // produce a vector of pointers, we need to either arbitrarily pick an 4215 // operand to broadcast, or broadcast a clone of the original GEP. 4216 // Here, we broadcast a clone of the original. 4217 // 4218 // TODO: If at some point we decide to scalarize instructions having 4219 // loop-invariant operands, this special case will no longer be 4220 // required. We would add the scalarization decision to 4221 // collectLoopScalars() and teach getVectorValue() to broadcast 4222 // the lane-zero scalar value. 4223 auto *Clone = Builder.Insert(GEP->clone()); 4224 for (unsigned Part = 0; Part < UF; ++Part) { 4225 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4226 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4227 addMetadata(EntryPart, GEP); 4228 } 4229 } else { 4230 // If the GEP has at least one loop-varying operand, we are sure to 4231 // produce a vector of pointers. But if we are only unrolling, we want 4232 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4233 // produce with the code below will be scalar (if VF == 1) or vector 4234 // (otherwise). Note that for the unroll-only case, we still maintain 4235 // values in the vector mapping with initVector, as we do for other 4236 // instructions. 4237 for (unsigned Part = 0; Part < UF; ++Part) { 4238 // The pointer operand of the new GEP. If it's loop-invariant, we 4239 // won't broadcast it. 4240 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) 4241 : State.get(Operands.getOperand(0), Part); 4242 4243 // Collect all the indices for the new GEP. If any index is 4244 // loop-invariant, we won't broadcast it. 4245 SmallVector<Value *, 4> Indices; 4246 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4247 VPValue *Operand = Operands.getOperand(I); 4248 if (IsIndexLoopInvariant[I - 1]) 4249 Indices.push_back(State.get(Operand, {0, 0})); 4250 else 4251 Indices.push_back(State.get(Operand, Part)); 4252 } 4253 4254 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4255 // but it should be a vector, otherwise. 4256 auto *NewGEP = 4257 GEP->isInBounds() 4258 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4259 Indices) 4260 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4261 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4262 "NewGEP is not a pointer vector"); 4263 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4264 addMetadata(NewGEP, GEP); 4265 } 4266 } 4267 } 4268 4269 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4270 unsigned VF) { 4271 PHINode *P = cast<PHINode>(PN); 4272 if (EnableVPlanNativePath) { 4273 // Currently we enter here in the VPlan-native path for non-induction 4274 // PHIs where all control flow is uniform. We simply widen these PHIs. 4275 // Create a vector phi with no operands - the vector phi operands will be 4276 // set at the end of vector code generation. 4277 Type *VecTy = 4278 (VF == 1) ? 
PN->getType() : FixedVectorType::get(PN->getType(), VF); 4279 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4280 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4281 OrigPHIsToFix.push_back(P); 4282 4283 return; 4284 } 4285 4286 assert(PN->getParent() == OrigLoop->getHeader() && 4287 "Non-header phis should have been handled elsewhere"); 4288 4289 // In order to support recurrences we need to be able to vectorize Phi nodes. 4290 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4291 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4292 // this value when we vectorize all of the instructions that use the PHI. 4293 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4294 for (unsigned Part = 0; Part < UF; ++Part) { 4295 // This is phase one of vectorizing PHIs. 4296 bool ScalarPHI = (VF == 1) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4297 Type *VecTy = 4298 ScalarPHI ? PN->getType() : FixedVectorType::get(PN->getType(), VF); 4299 Value *EntryPart = PHINode::Create( 4300 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4301 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4302 } 4303 return; 4304 } 4305 4306 setDebugLocFromInst(Builder, P); 4307 4308 // This PHINode must be an induction variable. 4309 // Make sure that we know about it. 4310 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4311 4312 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4313 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4314 4315 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4316 // which can be found from the original scalar operations. 4317 switch (II.getKind()) { 4318 case InductionDescriptor::IK_NoInduction: 4319 llvm_unreachable("Unknown induction"); 4320 case InductionDescriptor::IK_IntInduction: 4321 case InductionDescriptor::IK_FpInduction: 4322 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4323 case InductionDescriptor::IK_PtrInduction: { 4324 // Handle the pointer induction variable case. 4325 assert(P->getType()->isPointerTy() && "Unexpected type."); 4326 4327 if (Cost->isScalarAfterVectorization(P, VF)) { 4328 // This is the normalized GEP that starts counting at zero. 4329 Value *PtrInd = 4330 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4331 // Determine the number of scalars we need to generate for each unroll 4332 // iteration. If the instruction is uniform, we only need to generate the 4333 // first lane. Otherwise, we generate all VF values. 4334 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 
1 : VF; 4335 for (unsigned Part = 0; Part < UF; ++Part) { 4336 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4337 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); 4338 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4339 Value *SclrGep = 4340 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4341 SclrGep->setName("next.gep"); 4342 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4343 } 4344 } 4345 return; 4346 } 4347 assert(isa<SCEVConstant>(II.getStep()) && 4348 "Induction step not a SCEV constant!"); 4349 Type *PhiType = II.getStep()->getType(); 4350 4351 // Build a pointer phi 4352 Value *ScalarStartValue = II.getStartValue(); 4353 Type *ScStValueType = ScalarStartValue->getType(); 4354 PHINode *NewPointerPhi = 4355 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4356 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4357 4358 // A pointer induction, performed by using a gep 4359 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4360 Instruction *InductionLoc = LoopLatch->getTerminator(); 4361 const SCEV *ScalarStep = II.getStep(); 4362 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4363 Value *ScalarStepValue = 4364 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4365 Value *InductionGEP = GetElementPtrInst::Create( 4366 ScStValueType->getPointerElementType(), NewPointerPhi, 4367 Builder.CreateMul(ScalarStepValue, ConstantInt::get(PhiType, VF * UF)), 4368 "ptr.ind", InductionLoc); 4369 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4370 4371 // Create UF many actual address geps that use the pointer 4372 // phi as base and a vectorized version of the step value 4373 // (<step*0, ..., step*N>) as offset. 4374 for (unsigned Part = 0; Part < UF; ++Part) { 4375 SmallVector<Constant *, 8> Indices; 4376 // Create a vector of consecutive numbers from zero to VF. 4377 for (unsigned i = 0; i < VF; ++i) 4378 Indices.push_back(ConstantInt::get(PhiType, i + Part * VF)); 4379 Constant *StartOffset = ConstantVector::get(Indices); 4380 4381 Value *GEP = Builder.CreateGEP( 4382 ScStValueType->getPointerElementType(), NewPointerPhi, 4383 Builder.CreateMul(StartOffset, 4384 Builder.CreateVectorSplat(VF, ScalarStepValue), 4385 "vector.gep")); 4386 VectorLoopValueMap.setVectorValue(P, Part, GEP); 4387 } 4388 } 4389 } 4390 } 4391 4392 /// A helper function for checking whether an integer division-related 4393 /// instruction may divide by zero (in which case it must be predicated if 4394 /// executed conditionally in the scalar code). 4395 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4396 /// Non-zero divisors that are non compile-time constants will not be 4397 /// converted into multiplication, so we will still end up scalarizing 4398 /// the division, but can do so w/o predication. 
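/// For instance (illustrative): a conditionally executed "a[i] / b[i]" has a
/// non-constant divisor, so it must remain predicated when scalarized, while
/// "a[i] / 7" has a known non-zero divisor and can be scalarized without
/// predication.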
4399 static bool mayDivideByZero(Instruction &I) { 4400 assert((I.getOpcode() == Instruction::UDiv || 4401 I.getOpcode() == Instruction::SDiv || 4402 I.getOpcode() == Instruction::URem || 4403 I.getOpcode() == Instruction::SRem) && 4404 "Unexpected instruction"); 4405 Value *Divisor = I.getOperand(1); 4406 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4407 return !CInt || CInt->isZero(); 4408 } 4409 4410 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, 4411 VPTransformState &State) { 4412 switch (I.getOpcode()) { 4413 case Instruction::Call: 4414 case Instruction::Br: 4415 case Instruction::PHI: 4416 case Instruction::GetElementPtr: 4417 case Instruction::Select: 4418 llvm_unreachable("This instruction is handled by a different recipe."); 4419 case Instruction::UDiv: 4420 case Instruction::SDiv: 4421 case Instruction::SRem: 4422 case Instruction::URem: 4423 case Instruction::Add: 4424 case Instruction::FAdd: 4425 case Instruction::Sub: 4426 case Instruction::FSub: 4427 case Instruction::FNeg: 4428 case Instruction::Mul: 4429 case Instruction::FMul: 4430 case Instruction::FDiv: 4431 case Instruction::FRem: 4432 case Instruction::Shl: 4433 case Instruction::LShr: 4434 case Instruction::AShr: 4435 case Instruction::And: 4436 case Instruction::Or: 4437 case Instruction::Xor: { 4438 // Just widen unops and binops. 4439 setDebugLocFromInst(Builder, &I); 4440 4441 for (unsigned Part = 0; Part < UF; ++Part) { 4442 SmallVector<Value *, 2> Ops; 4443 for (VPValue *VPOp : User.operands()) 4444 Ops.push_back(State.get(VPOp, Part)); 4445 4446 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4447 4448 if (auto *VecOp = dyn_cast<Instruction>(V)) 4449 VecOp->copyIRFlags(&I); 4450 4451 // Use this vector value for all users of the original instruction. 4452 VectorLoopValueMap.setVectorValue(&I, Part, V); 4453 addMetadata(V, &I); 4454 } 4455 4456 break; 4457 } 4458 case Instruction::ICmp: 4459 case Instruction::FCmp: { 4460 // Widen compares. Generate vector compares. 4461 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4462 auto *Cmp = cast<CmpInst>(&I); 4463 setDebugLocFromInst(Builder, Cmp); 4464 for (unsigned Part = 0; Part < UF; ++Part) { 4465 Value *A = State.get(User.getOperand(0), Part); 4466 Value *B = State.get(User.getOperand(1), Part); 4467 Value *C = nullptr; 4468 if (FCmp) { 4469 // Propagate fast math flags. 4470 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4471 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4472 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4473 } else { 4474 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4475 } 4476 VectorLoopValueMap.setVectorValue(&I, Part, C); 4477 addMetadata(C, &I); 4478 } 4479 4480 break; 4481 } 4482 4483 case Instruction::ZExt: 4484 case Instruction::SExt: 4485 case Instruction::FPToUI: 4486 case Instruction::FPToSI: 4487 case Instruction::FPExt: 4488 case Instruction::PtrToInt: 4489 case Instruction::IntToPtr: 4490 case Instruction::SIToFP: 4491 case Instruction::UIToFP: 4492 case Instruction::Trunc: 4493 case Instruction::FPTrunc: 4494 case Instruction::BitCast: { 4495 auto *CI = cast<CastInst>(&I); 4496 setDebugLocFromInst(Builder, CI); 4497 4498 /// Vectorize casts. 4499 Type *DestTy = 4500 (VF == 1) ? 
CI->getType() : FixedVectorType::get(CI->getType(), VF); 4501 4502 for (unsigned Part = 0; Part < UF; ++Part) { 4503 Value *A = State.get(User.getOperand(0), Part); 4504 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4505 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4506 addMetadata(Cast, &I); 4507 } 4508 break; 4509 } 4510 default: 4511 // This instruction is not vectorized by simple widening. 4512 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4513 llvm_unreachable("Unhandled instruction!"); 4514 } // end of switch. 4515 } 4516 4517 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, 4518 VPTransformState &State) { 4519 assert(!isa<DbgInfoIntrinsic>(I) && 4520 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4521 setDebugLocFromInst(Builder, &I); 4522 4523 Module *M = I.getParent()->getParent()->getParent(); 4524 auto *CI = cast<CallInst>(&I); 4525 4526 SmallVector<Type *, 4> Tys; 4527 for (Value *ArgOperand : CI->arg_operands()) 4528 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4529 4530 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4531 4532 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4533 // version of the instruction. 4534 // Is it beneficial to perform intrinsic call compared to lib call? 4535 bool NeedToScalarize = false; 4536 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4537 bool UseVectorIntrinsic = 4538 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4539 assert((UseVectorIntrinsic || !NeedToScalarize) && 4540 "Instruction should be scalarized elsewhere."); 4541 4542 for (unsigned Part = 0; Part < UF; ++Part) { 4543 SmallVector<Value *, 4> Args; 4544 for (auto &I : enumerate(ArgOperands.operands())) { 4545 // Some intrinsics have a scalar argument - don't replace it with a 4546 // vector. 4547 Value *Arg; 4548 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4549 Arg = State.get(I.value(), Part); 4550 else 4551 Arg = State.get(I.value(), {0, 0}); 4552 Args.push_back(Arg); 4553 } 4554 4555 Function *VectorF; 4556 if (UseVectorIntrinsic) { 4557 // Use vector version of the intrinsic. 4558 Type *TysForDecl[] = {CI->getType()}; 4559 if (VF > 1) 4560 TysForDecl[0] = 4561 FixedVectorType::get(CI->getType()->getScalarType(), VF); 4562 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4563 assert(VectorF && "Can't retrieve vector intrinsic."); 4564 } else { 4565 // Use vector version of the function call. 4566 const VFShape Shape = VFShape::get(*CI, ElementCount::getFixed(VF), 4567 false /*HasGlobalPred*/); 4568 #ifndef NDEBUG 4569 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4570 "Can't create vector function."); 4571 #endif 4572 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4573 } 4574 SmallVector<OperandBundleDef, 1> OpBundles; 4575 CI->getOperandBundlesAsDefs(OpBundles); 4576 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4577 4578 if (isa<FPMathOperator>(V)) 4579 V->copyFastMathFlags(CI); 4580 4581 VectorLoopValueMap.setVectorValue(&I, Part, V); 4582 addMetadata(V, &I); 4583 } 4584 } 4585 4586 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, 4587 VPUser &Operands, 4588 bool InvariantCond, 4589 VPTransformState &State) { 4590 setDebugLocFromInst(Builder, &I); 4591 4592 // The condition can be loop invariant but still defined inside the 4593 // loop. This means that we can't just use the original 'cond' value. 
4594 // We have to take the 'vectorized' value and pick the first lane. 4595 // Instcombine will make this a no-op. 4596 auto *InvarCond = 4597 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4598 4599 for (unsigned Part = 0; Part < UF; ++Part) { 4600 Value *Cond = 4601 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4602 Value *Op0 = State.get(Operands.getOperand(1), Part); 4603 Value *Op1 = State.get(Operands.getOperand(2), Part); 4604 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4605 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4606 addMetadata(Sel, &I); 4607 } 4608 } 4609 4610 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4611 // We should not collect Scalars more than once per VF. Right now, this 4612 // function is called from collectUniformsAndScalars(), which already does 4613 // this check. Collecting Scalars for VF=1 does not make any sense. 4614 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && 4615 "This function should not be visited twice for the same VF"); 4616 4617 SmallSetVector<Instruction *, 8> Worklist; 4618 4619 // These sets are used to seed the analysis with pointers used by memory 4620 // accesses that will remain scalar. 4621 SmallSetVector<Instruction *, 8> ScalarPtrs; 4622 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4623 auto *Latch = TheLoop->getLoopLatch(); 4624 4625 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4626 // The pointer operands of loads and stores will be scalar as long as the 4627 // memory access is not a gather or scatter operation. The value operand of a 4628 // store will remain scalar if the store is scalarized. 4629 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4630 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4631 assert(WideningDecision != CM_Unknown && 4632 "Widening decision should be ready at this moment"); 4633 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4634 if (Ptr == Store->getValueOperand()) 4635 return WideningDecision == CM_Scalarize; 4636 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4637 "Ptr is neither a value or pointer operand"); 4638 return WideningDecision != CM_GatherScatter; 4639 }; 4640 4641 // A helper that returns true if the given value is a bitcast or 4642 // getelementptr instruction contained in the loop. 4643 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4644 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4645 isa<GetElementPtrInst>(V)) && 4646 !TheLoop->isLoopInvariant(V); 4647 }; 4648 4649 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 4650 if (!isa<PHINode>(Ptr) || 4651 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 4652 return false; 4653 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 4654 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 4655 return false; 4656 return isScalarUse(MemAccess, Ptr); 4657 }; 4658 4659 // A helper that evaluates a memory access's use of a pointer. If the 4660 // pointer is actually the pointer induction of a loop, it is being 4661 // inserted into Worklist. If the use will be a scalar use, and the 4662 // pointer is only used by memory accesses, we place the pointer in 4663 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 
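  // For illustration (assumed example): for "store i32 %v, i32* %gep" that is
  // not a scatter, the pointer %gep lands in ScalarPtrs provided loads and
  // stores are its only users; if %gep is additionally used by, say, a
  // ptrtoint, it lands in PossibleNonScalarPtrs instead and will not seed the
  // scalars worklist.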
4664 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4665 if (isScalarPtrInduction(MemAccess, Ptr)) { 4666 Worklist.insert(cast<Instruction>(Ptr)); 4667 Instruction *Update = cast<Instruction>( 4668 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 4669 Worklist.insert(Update); 4670 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 4671 << "\n"); 4672 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 4673 << "\n"); 4674 return; 4675 } 4676 // We only care about bitcast and getelementptr instructions contained in 4677 // the loop. 4678 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4679 return; 4680 4681 // If the pointer has already been identified as scalar (e.g., if it was 4682 // also identified as uniform), there's nothing to do. 4683 auto *I = cast<Instruction>(Ptr); 4684 if (Worklist.count(I)) 4685 return; 4686 4687 // If the use of the pointer will be a scalar use, and all users of the 4688 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4689 // place the pointer in PossibleNonScalarPtrs. 4690 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4691 return isa<LoadInst>(U) || isa<StoreInst>(U); 4692 })) 4693 ScalarPtrs.insert(I); 4694 else 4695 PossibleNonScalarPtrs.insert(I); 4696 }; 4697 4698 // We seed the scalars analysis with three classes of instructions: (1) 4699 // instructions marked uniform-after-vectorization and (2) bitcast, 4700 // getelementptr and (pointer) phi instructions used by memory accesses 4701 // requiring a scalar use. 4702 // 4703 // (1) Add to the worklist all instructions that have been identified as 4704 // uniform-after-vectorization. 4705 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4706 4707 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4708 // memory accesses requiring a scalar use. The pointer operands of loads and 4709 // stores will be scalar as long as the memory accesses is not a gather or 4710 // scatter operation. The value operand of a store will remain scalar if the 4711 // store is scalarized. 4712 for (auto *BB : TheLoop->blocks()) 4713 for (auto &I : *BB) { 4714 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4715 evaluatePtrUse(Load, Load->getPointerOperand()); 4716 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4717 evaluatePtrUse(Store, Store->getPointerOperand()); 4718 evaluatePtrUse(Store, Store->getValueOperand()); 4719 } 4720 } 4721 for (auto *I : ScalarPtrs) 4722 if (!PossibleNonScalarPtrs.count(I)) { 4723 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4724 Worklist.insert(I); 4725 } 4726 4727 // Insert the forced scalars. 4728 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4729 // induction variable when the PHI user is scalarized. 4730 auto ForcedScalar = ForcedScalars.find(VF); 4731 if (ForcedScalar != ForcedScalars.end()) 4732 for (auto *I : ForcedScalar->second) 4733 Worklist.insert(I); 4734 4735 // Expand the worklist by looking through any bitcasts and getelementptr 4736 // instructions we've already identified as scalar. This is similar to the 4737 // expansion step in collectLoopUniforms(); however, here we're only 4738 // expanding to include additional bitcasts and getelementptr instructions. 
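  // Sketch of the expansion (illustrative): if a pointer already known to be
  // scalar is "%p = bitcast i64* %gep to i32*", then %gep is added to the
  // worklist as well, provided each of its in-loop users is either already in
  // the worklist or a load/store that uses %gep as a scalar pointer.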
4739 unsigned Idx = 0; 4740 while (Idx != Worklist.size()) { 4741 Instruction *Dst = Worklist[Idx++]; 4742 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4743 continue; 4744 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4745 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4746 auto *J = cast<Instruction>(U); 4747 return !TheLoop->contains(J) || Worklist.count(J) || 4748 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4749 isScalarUse(J, Src)); 4750 })) { 4751 Worklist.insert(Src); 4752 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4753 } 4754 } 4755 4756 // An induction variable will remain scalar if all users of the induction 4757 // variable and induction variable update remain scalar. 4758 for (auto &Induction : Legal->getInductionVars()) { 4759 auto *Ind = Induction.first; 4760 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4761 4762 // If tail-folding is applied, the primary induction variable will be used 4763 // to feed a vector compare. 4764 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4765 continue; 4766 4767 // Determine if all users of the induction variable are scalar after 4768 // vectorization. 4769 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4770 auto *I = cast<Instruction>(U); 4771 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4772 }); 4773 if (!ScalarInd) 4774 continue; 4775 4776 // Determine if all users of the induction variable update instruction are 4777 // scalar after vectorization. 4778 auto ScalarIndUpdate = 4779 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4780 auto *I = cast<Instruction>(U); 4781 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4782 }); 4783 if (!ScalarIndUpdate) 4784 continue; 4785 4786 // The induction variable and its update instruction will remain scalar. 4787 Worklist.insert(Ind); 4788 Worklist.insert(IndUpdate); 4789 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4790 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4791 << "\n"); 4792 } 4793 4794 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4795 } 4796 4797 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { 4798 if (!blockNeedsPredication(I->getParent())) 4799 return false; 4800 switch(I->getOpcode()) { 4801 default: 4802 break; 4803 case Instruction::Load: 4804 case Instruction::Store: { 4805 if (!Legal->isMaskRequired(I)) 4806 return false; 4807 auto *Ptr = getLoadStorePointerOperand(I); 4808 auto *Ty = getMemInstValueType(I); 4809 // We have already decided how to vectorize this instruction, get that 4810 // result. 4811 if (VF > 1) { 4812 InstWidening WideningDecision = getWideningDecision(I, VF); 4813 assert(WideningDecision != CM_Unknown && 4814 "Widening decision should be ready at this moment"); 4815 return WideningDecision == CM_Scalarize; 4816 } 4817 const Align Alignment = getLoadStoreAlignment(I); 4818 return isa<LoadInst>(I) ? 
!(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4819 isLegalMaskedGather(Ty, Alignment)) 4820 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4821 isLegalMaskedScatter(Ty, Alignment)); 4822 } 4823 case Instruction::UDiv: 4824 case Instruction::SDiv: 4825 case Instruction::SRem: 4826 case Instruction::URem: 4827 return mayDivideByZero(*I); 4828 } 4829 return false; 4830 } 4831 4832 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, 4833 unsigned VF) { 4834 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4835 assert(getWideningDecision(I, VF) == CM_Unknown && 4836 "Decision should not be set yet."); 4837 auto *Group = getInterleavedAccessGroup(I); 4838 assert(Group && "Must have a group."); 4839 4840 // If the instruction's allocated size doesn't equal it's type size, it 4841 // requires padding and will be scalarized. 4842 auto &DL = I->getModule()->getDataLayout(); 4843 auto *ScalarTy = getMemInstValueType(I); 4844 if (hasIrregularType(ScalarTy, DL, VF)) 4845 return false; 4846 4847 // Check if masking is required. 4848 // A Group may need masking for one of two reasons: it resides in a block that 4849 // needs predication, or it was decided to use masking to deal with gaps. 4850 bool PredicatedAccessRequiresMasking = 4851 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 4852 bool AccessWithGapsRequiresMasking = 4853 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 4854 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 4855 return true; 4856 4857 // If masked interleaving is required, we expect that the user/target had 4858 // enabled it, because otherwise it either wouldn't have been created or 4859 // it should have been invalidated by the CostModel. 4860 assert(useMaskedInterleavedAccesses(TTI) && 4861 "Masked interleave-groups for predicated accesses are not enabled."); 4862 4863 auto *Ty = getMemInstValueType(I); 4864 const Align Alignment = getLoadStoreAlignment(I); 4865 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4866 : TTI.isLegalMaskedStore(Ty, Alignment); 4867 } 4868 4869 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, 4870 unsigned VF) { 4871 // Get and ensure we have a valid memory instruction. 4872 LoadInst *LI = dyn_cast<LoadInst>(I); 4873 StoreInst *SI = dyn_cast<StoreInst>(I); 4874 assert((LI || SI) && "Invalid memory instruction"); 4875 4876 auto *Ptr = getLoadStorePointerOperand(I); 4877 4878 // In order to be widened, the pointer should be consecutive, first of all. 4879 if (!Legal->isConsecutivePtr(Ptr)) 4880 return false; 4881 4882 // If the instruction is a store located in a predicated block, it will be 4883 // scalarized. 4884 if (isScalarWithPredication(I)) 4885 return false; 4886 4887 // If the instruction's allocated size doesn't equal it's type size, it 4888 // requires padding and will be scalarized. 4889 auto &DL = I->getModule()->getDataLayout(); 4890 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 4891 if (hasIrregularType(ScalarTy, DL, VF)) 4892 return false; 4893 4894 return true; 4895 } 4896 4897 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { 4898 // We should not collect Uniforms more than once per VF. Right now, 4899 // this function is called from collectUniformsAndScalars(), which 4900 // already does this check. Collecting Uniforms for VF=1 does not make any 4901 // sense. 
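  // For intuition (illustrative, not exhaustive): an instruction is uniform
  // after vectorization when all VF lanes would observe the same value, e.g.
  // the pointer operand of a consecutive, widened load is needed only once
  // per vector iteration and therefore need not be replicated per lane.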
4902 4903 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && 4904 "This function should not be visited twice for the same VF"); 4905 4906 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4907 // not analyze again. Uniforms.count(VF) will return 1. 4908 Uniforms[VF].clear(); 4909 4910 // We now know that the loop is vectorizable! 4911 // Collect instructions inside the loop that will remain uniform after 4912 // vectorization. 4913 4914 // Global values, params and instructions outside of current loop are out of 4915 // scope. 4916 auto isOutOfScope = [&](Value *V) -> bool { 4917 Instruction *I = dyn_cast<Instruction>(V); 4918 return (!I || !TheLoop->contains(I)); 4919 }; 4920 4921 SetVector<Instruction *> Worklist; 4922 BasicBlock *Latch = TheLoop->getLoopLatch(); 4923 4924 // Instructions that are scalar with predication must not be considered 4925 // uniform after vectorization, because that would create an erroneous 4926 // replicating region where only a single instance out of VF should be formed. 4927 // TODO: optimize such seldom cases if found important, see PR40816. 4928 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4929 if (isScalarWithPredication(I, VF)) { 4930 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4931 << *I << "\n"); 4932 return; 4933 } 4934 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4935 Worklist.insert(I); 4936 }; 4937 4938 // Start with the conditional branch. If the branch condition is an 4939 // instruction contained in the loop that is only used by the branch, it is 4940 // uniform. 4941 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4942 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4943 addToWorklistIfAllowed(Cmp); 4944 4945 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 4946 // are pointers that are treated like consecutive pointers during 4947 // vectorization. The pointer operands of interleaved accesses are an 4948 // example. 4949 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 4950 4951 // Holds pointer operands of instructions that are possibly non-uniform. 4952 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 4953 4954 auto isUniformDecision = [&](Instruction *I, unsigned VF) { 4955 InstWidening WideningDecision = getWideningDecision(I, VF); 4956 assert(WideningDecision != CM_Unknown && 4957 "Widening decision should be ready at this moment"); 4958 4959 return (WideningDecision == CM_Widen || 4960 WideningDecision == CM_Widen_Reverse || 4961 WideningDecision == CM_Interleave); 4962 }; 4963 // Iterate over the instructions in the loop, and collect all 4964 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 4965 // that a consecutive-like pointer operand will be scalarized, we collect it 4966 // in PossibleNonUniformPtrs instead. We use two sets here because a single 4967 // getelementptr instruction can be used by both vectorized and scalarized 4968 // memory instructions. For example, if a loop loads and stores from the same 4969 // location, but the store is conditional, the store will be scalarized, and 4970 // the getelementptr won't remain uniform. 4971 for (auto *BB : TheLoop->blocks()) 4972 for (auto &I : *BB) { 4973 // If there's no pointer operand, there's nothing to do. 
4974 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 4975 if (!Ptr) 4976 continue; 4977 4978 // True if all users of Ptr are memory accesses that have Ptr as their 4979 // pointer operand. 4980 auto UsersAreMemAccesses = 4981 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 4982 return getLoadStorePointerOperand(U) == Ptr; 4983 }); 4984 4985 // Ensure the memory instruction will not be scalarized or used by 4986 // gather/scatter, making its pointer operand non-uniform. If the pointer 4987 // operand is used by any instruction other than a memory access, we 4988 // conservatively assume the pointer operand may be non-uniform. 4989 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 4990 PossibleNonUniformPtrs.insert(Ptr); 4991 4992 // If the memory instruction will be vectorized and its pointer operand 4993 // is consecutive-like, or interleaving - the pointer operand should 4994 // remain uniform. 4995 else 4996 ConsecutiveLikePtrs.insert(Ptr); 4997 } 4998 4999 // Add to the Worklist all consecutive and consecutive-like pointers that 5000 // aren't also identified as possibly non-uniform. 5001 for (auto *V : ConsecutiveLikePtrs) 5002 if (!PossibleNonUniformPtrs.count(V)) 5003 addToWorklistIfAllowed(V); 5004 5005 // Expand Worklist in topological order: whenever a new instruction 5006 // is added , its users should be already inside Worklist. It ensures 5007 // a uniform instruction will only be used by uniform instructions. 5008 unsigned idx = 0; 5009 while (idx != Worklist.size()) { 5010 Instruction *I = Worklist[idx++]; 5011 5012 for (auto OV : I->operand_values()) { 5013 // isOutOfScope operands cannot be uniform instructions. 5014 if (isOutOfScope(OV)) 5015 continue; 5016 // First order recurrence Phi's should typically be considered 5017 // non-uniform. 5018 auto *OP = dyn_cast<PHINode>(OV); 5019 if (OP && Legal->isFirstOrderRecurrence(OP)) 5020 continue; 5021 // If all the users of the operand are uniform, then add the 5022 // operand into the uniform worklist. 5023 auto *OI = cast<Instruction>(OV); 5024 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5025 auto *J = cast<Instruction>(U); 5026 return Worklist.count(J) || 5027 (OI == getLoadStorePointerOperand(J) && 5028 isUniformDecision(J, VF)); 5029 })) 5030 addToWorklistIfAllowed(OI); 5031 } 5032 } 5033 5034 // Returns true if Ptr is the pointer operand of a memory access instruction 5035 // I, and I is known to not require scalarization. 5036 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5037 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5038 }; 5039 5040 // For an instruction to be added into Worklist above, all its users inside 5041 // the loop should also be in Worklist. However, this condition cannot be 5042 // true for phi nodes that form a cyclic dependence. We must process phi 5043 // nodes separately. An induction variable will remain uniform if all users 5044 // of the induction variable and induction variable update remain uniform. 5045 // The code below handles both pointer and non-pointer induction variables. 5046 for (auto &Induction : Legal->getInductionVars()) { 5047 auto *Ind = Induction.first; 5048 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5049 5050 // Determine if all users of the induction variable are uniform after 5051 // vectorization. 
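    // (Illustrative: an induction used only by its own update and by the
    // address computations of consecutive, widened accesses remains uniform;
    // an induction whose value is itself stored element-by-element does not,
    // since each lane then needs its own copy.)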
5052 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5053 auto *I = cast<Instruction>(U); 5054 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5055 isVectorizedMemAccessUse(I, Ind); 5056 }); 5057 if (!UniformInd) 5058 continue; 5059 5060 // Determine if all users of the induction variable update instruction are 5061 // uniform after vectorization. 5062 auto UniformIndUpdate = 5063 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5064 auto *I = cast<Instruction>(U); 5065 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5066 isVectorizedMemAccessUse(I, IndUpdate); 5067 }); 5068 if (!UniformIndUpdate) 5069 continue; 5070 5071 // The induction variable and its update instruction will remain uniform. 5072 addToWorklistIfAllowed(Ind); 5073 addToWorklistIfAllowed(IndUpdate); 5074 } 5075 5076 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5077 } 5078 5079 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5080 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5081 5082 if (Legal->getRuntimePointerChecking()->Need) { 5083 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5084 "runtime pointer checks needed. Enable vectorization of this " 5085 "loop with '#pragma clang loop vectorize(enable)' when " 5086 "compiling with -Os/-Oz", 5087 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5088 return true; 5089 } 5090 5091 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5092 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5093 "runtime SCEV checks needed. Enable vectorization of this " 5094 "loop with '#pragma clang loop vectorize(enable)' when " 5095 "compiling with -Os/-Oz", 5096 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5097 return true; 5098 } 5099 5100 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5101 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5102 reportVectorizationFailure("Runtime stride check for small trip count", 5103 "runtime stride == 1 checks needed. Enable vectorization of " 5104 "this loop without such check by compiling with -Os/-Oz", 5105 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5106 return true; 5107 } 5108 5109 return false; 5110 } 5111 5112 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, 5113 unsigned UserIC) { 5114 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5115 // TODO: It may by useful to do since it's still likely to be dynamically 5116 // uniform if the target can skip. 5117 reportVectorizationFailure( 5118 "Not inserting runtime ptr check for divergent target", 5119 "runtime pointer checks needed. Not enabled for divergent target", 5120 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5121 return None; 5122 } 5123 5124 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5125 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5126 if (TC == 1) { 5127 reportVectorizationFailure("Single iteration (non) loop", 5128 "loop trip count is one, irrelevant for vectorization", 5129 "SingleIterationLoop", ORE, TheLoop); 5130 return None; 5131 } 5132 5133 switch (ScalarEpilogueStatus) { 5134 case CM_ScalarEpilogueAllowed: 5135 return UserVF ? 
UserVF : computeFeasibleMaxVF(TC); 5136 case CM_ScalarEpilogueNotNeededUsePredicate: 5137 LLVM_DEBUG( 5138 dbgs() << "LV: vector predicate hint/switch found.\n" 5139 << "LV: Not allowing scalar epilogue, creating predicated " 5140 << "vector loop.\n"); 5141 break; 5142 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5143 // fallthrough as a special case of OptForSize 5144 case CM_ScalarEpilogueNotAllowedOptSize: 5145 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5146 LLVM_DEBUG( 5147 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5148 else 5149 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5150 << "count.\n"); 5151 5152 // Bail if runtime checks are required, which are not good when optimising 5153 // for size. 5154 if (runtimeChecksRequired()) 5155 return None; 5156 break; 5157 } 5158 5159 // Now try the tail folding 5160 5161 // Invalidate interleave groups that require an epilogue if we can't mask 5162 // the interleave-group. 5163 if (!useMaskedInterleavedAccesses(TTI)) { 5164 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5165 "No decisions should have been taken at this point"); 5166 // Note: There is no need to invalidate any cost modeling decisions here, as 5167 // non where taken so far. 5168 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5169 } 5170 5171 unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC); 5172 assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2"); 5173 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF; 5174 if (TC > 0 && TC % MaxVFtimesIC == 0) { 5175 // Accept MaxVF if we do not have a tail. 5176 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5177 return MaxVF; 5178 } 5179 5180 // If we don't know the precise trip count, or if the trip count that we 5181 // found modulo the vectorization factor is not zero, try to fold the tail 5182 // by masking. 5183 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5184 if (Legal->prepareToFoldTailByMasking()) { 5185 FoldTailByMasking = true; 5186 return MaxVF; 5187 } 5188 5189 if (TC == 0) { 5190 reportVectorizationFailure( 5191 "Unable to calculate the loop count due to complex control flow", 5192 "unable to calculate the loop count due to complex control flow", 5193 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5194 return None; 5195 } 5196 5197 reportVectorizationFailure( 5198 "Cannot optimize for size and vectorize at the same time.", 5199 "cannot optimize for size and vectorize at the same time. " 5200 "Enable vectorization of this loop with '#pragma clang loop " 5201 "vectorize(enable)' when compiling with -Os/-Oz", 5202 "NoTailLoopWithOptForSize", ORE, TheLoop); 5203 return None; 5204 } 5205 5206 unsigned 5207 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 5208 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5209 unsigned SmallestType, WidestType; 5210 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5211 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5212 5213 // Get the maximum safe dependence distance in bits computed by LAA. 5214 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5215 // the memory accesses that is most restrictive (involved in the smallest 5216 // dependence distance). 
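  // Worked example (numbers assumed for illustration): if LAA found a safe
  // dependence distance of 8 i32 elements, MaxSafeRegisterWidth is
  // 8 * 4 * 8 = 256 bits; a 512-bit widest target register is then clamped to
  // 256 below, and with WidestType == 32 the computed MaxVectorSize becomes
  // PowerOf2Floor(256 / 32) == 8 lanes.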
5217 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 5218 5219 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 5220 5221 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5222 // Note that both WidestRegister and WidestType may not be a powers of 2. 5223 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); 5224 5225 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5226 << " / " << WidestType << " bits.\n"); 5227 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5228 << WidestRegister << " bits.\n"); 5229 5230 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 5231 " into one vector!"); 5232 if (MaxVectorSize == 0) { 5233 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5234 MaxVectorSize = 1; 5235 return MaxVectorSize; 5236 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5237 isPowerOf2_32(ConstTripCount)) { 5238 // We need to clamp the VF to be the ConstTripCount. There is no point in 5239 // choosing a higher viable VF as done in the loop below. 5240 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5241 << ConstTripCount << "\n"); 5242 MaxVectorSize = ConstTripCount; 5243 return MaxVectorSize; 5244 } 5245 5246 unsigned MaxVF = MaxVectorSize; 5247 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5248 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5249 // Collect all viable vectorization factors larger than the default MaxVF 5250 // (i.e. MaxVectorSize). 5251 SmallVector<unsigned, 8> VFs; 5252 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5253 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5254 VFs.push_back(VS); 5255 5256 // For each VF calculate its register usage. 5257 auto RUs = calculateRegisterUsage(VFs); 5258 5259 // Select the largest VF which doesn't require more registers than existing 5260 // ones. 5261 for (int i = RUs.size() - 1; i >= 0; --i) { 5262 bool Selected = true; 5263 for (auto& pair : RUs[i].MaxLocalUsers) { 5264 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5265 if (pair.second > TargetNumRegisters) 5266 Selected = false; 5267 } 5268 if (Selected) { 5269 MaxVF = VFs[i]; 5270 break; 5271 } 5272 } 5273 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5274 if (MaxVF < MinVF) { 5275 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5276 << ") with target's minimum: " << MinVF << '\n'); 5277 MaxVF = MinVF; 5278 } 5279 } 5280 } 5281 return MaxVF; 5282 } 5283 5284 VectorizationFactor 5285 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5286 float Cost = expectedCost(1).first; 5287 const float ScalarCost = Cost; 5288 unsigned Width = 1; 5289 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5290 5291 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5292 if (ForceVectorization && MaxVF > 1) { 5293 // Ignore scalar width, because the user explicitly wants vectorization. 5294 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5295 // evaluation. 5296 Cost = std::numeric_limits<float>::max(); 5297 } 5298 5299 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5300 // Notice that the vector loop needs to be executed less times, so 5301 // we need to divide the cost of the vector loops by the width of 5302 // the vector elements. 
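      // Worked example (costs assumed for illustration): with a scalar cost
      // of 8 per iteration and expectedCost(4).first == 20, the per-lane
      // vector cost is 20 / 4 = 5 < 8, so VF = 4 is preferred over the scalar
      // loop unless an even wider VF turns out cheaper.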
5303 VectorizationCostTy C = expectedCost(i); 5304 float VectorCost = C.first / (float)i; 5305 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5306 << " costs: " << (int)VectorCost << ".\n"); 5307 if (!C.second && !ForceVectorization) { 5308 LLVM_DEBUG( 5309 dbgs() << "LV: Not considering vector loop of width " << i 5310 << " because it will not generate any vector instructions.\n"); 5311 continue; 5312 } 5313 if (VectorCost < Cost) { 5314 Cost = VectorCost; 5315 Width = i; 5316 } 5317 } 5318 5319 if (!EnableCondStoresVectorization && NumPredStores) { 5320 reportVectorizationFailure("There are conditional stores.", 5321 "store that is conditionally executed prevents vectorization", 5322 "ConditionalStore", ORE, TheLoop); 5323 Width = 1; 5324 Cost = ScalarCost; 5325 } 5326 5327 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5328 << "LV: Vectorization seems to be not beneficial, " 5329 << "but was forced by a user.\n"); 5330 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5331 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; 5332 return Factor; 5333 } 5334 5335 std::pair<unsigned, unsigned> 5336 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5337 unsigned MinWidth = -1U; 5338 unsigned MaxWidth = 8; 5339 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5340 5341 // For each block. 5342 for (BasicBlock *BB : TheLoop->blocks()) { 5343 // For each instruction in the loop. 5344 for (Instruction &I : BB->instructionsWithoutDebug()) { 5345 Type *T = I.getType(); 5346 5347 // Skip ignored values. 5348 if (ValuesToIgnore.count(&I)) 5349 continue; 5350 5351 // Only examine Loads, Stores and PHINodes. 5352 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5353 continue; 5354 5355 // Examine PHI nodes that are reduction variables. Update the type to 5356 // account for the recurrence type. 5357 if (auto *PN = dyn_cast<PHINode>(&I)) { 5358 if (!Legal->isReductionVariable(PN)) 5359 continue; 5360 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5361 T = RdxDesc.getRecurrenceType(); 5362 } 5363 5364 // Examine the stored values. 5365 if (auto *ST = dyn_cast<StoreInst>(&I)) 5366 T = ST->getValueOperand()->getType(); 5367 5368 // Ignore loaded pointer types and stored pointer types that are not 5369 // vectorizable. 5370 // 5371 // FIXME: The check here attempts to predict whether a load or store will 5372 // be vectorized. We only know this for certain after a VF has 5373 // been selected. Here, we assume that if an access can be 5374 // vectorized, it will be. We should also look at extending this 5375 // optimization to non-pointer types. 5376 // 5377 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5378 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5379 continue; 5380 5381 MinWidth = std::min(MinWidth, 5382 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5383 MaxWidth = std::max(MaxWidth, 5384 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5385 } 5386 } 5387 5388 return {MinWidth, MaxWidth}; 5389 } 5390 5391 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, 5392 unsigned LoopCost) { 5393 // -- The interleave heuristics -- 5394 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5395 // There are many micro-architectural considerations that we can't predict 5396 // at this level. 
For example, frontend pressure (on decode or fetch) due to 5397 // code size, or the number and capabilities of the execution ports. 5398 // 5399 // We use the following heuristics to select the interleave count: 5400 // 1. If the code has reductions, then we interleave to break the cross 5401 // iteration dependency. 5402 // 2. If the loop is really small, then we interleave to reduce the loop 5403 // overhead. 5404 // 3. We don't interleave if we think that we will spill registers to memory 5405 // due to the increased register pressure. 5406 5407 if (!isScalarEpilogueAllowed()) 5408 return 1; 5409 5410 // We used the distance for the interleave count. 5411 if (Legal->getMaxSafeDepDistBytes() != -1U) 5412 return 1; 5413 5414 // Do not interleave loops with a relatively small known or estimated trip 5415 // count. 5416 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5417 if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) 5418 return 1; 5419 5420 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5421 // We divide by these constants so assume that we have at least one 5422 // instruction that uses at least one register. 5423 for (auto& pair : R.MaxLocalUsers) { 5424 pair.second = std::max(pair.second, 1U); 5425 } 5426 5427 // We calculate the interleave count using the following formula. 5428 // Subtract the number of loop invariants from the number of available 5429 // registers. These registers are used by all of the interleaved instances. 5430 // Next, divide the remaining registers by the number of registers that is 5431 // required by the loop, in order to estimate how many parallel instances 5432 // fit without causing spills. All of this is rounded down if necessary to be 5433 // a power of two. We want power of two interleave count to simplify any 5434 // addressing operations or alignment considerations. 5435 // We also want power of two interleave counts to ensure that the induction 5436 // variable of the vector loop wraps to zero, when tail is folded by masking; 5437 // this currently happens when OptForSize, in which case IC is set to 1 above. 5438 unsigned IC = UINT_MAX; 5439 5440 for (auto& pair : R.MaxLocalUsers) { 5441 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5442 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5443 << " registers of " 5444 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5445 if (VF == 1) { 5446 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5447 TargetNumRegisters = ForceTargetNumScalarRegs; 5448 } else { 5449 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5450 TargetNumRegisters = ForceTargetNumVectorRegs; 5451 } 5452 unsigned MaxLocalUsers = pair.second; 5453 unsigned LoopInvariantRegs = 0; 5454 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5455 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5456 5457 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5458 // Don't count the induction variable as interleaved. 5459 if (EnableIndVarRegisterHeur) { 5460 TmpIC = 5461 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5462 std::max(1U, (MaxLocalUsers - 1))); 5463 } 5464 5465 IC = std::min(IC, TmpIC); 5466 } 5467 5468 // Clamp the interleave ranges to reasonable counts. 5469 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5470 5471 // Check if the user has overridden the max. 
5472 if (VF == 1) { 5473 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5474 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5475 } else { 5476 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5477 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5478 } 5479 5480 // If trip count is known or estimated compile time constant, limit the 5481 // interleave count to be less than the trip count divided by VF. 5482 if (BestKnownTC) { 5483 MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); 5484 } 5485 5486 // If we did not calculate the cost for VF (because the user selected the VF) 5487 // then we calculate the cost of VF here. 5488 if (LoopCost == 0) 5489 LoopCost = expectedCost(VF).first; 5490 5491 assert(LoopCost && "Non-zero loop cost expected"); 5492 5493 // Clamp the calculated IC to be between the 1 and the max interleave count 5494 // that the target and trip count allows. 5495 if (IC > MaxInterleaveCount) 5496 IC = MaxInterleaveCount; 5497 else if (IC < 1) 5498 IC = 1; 5499 5500 // Interleave if we vectorized this loop and there is a reduction that could 5501 // benefit from interleaving. 5502 if (VF > 1 && !Legal->getReductionVars().empty()) { 5503 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5504 return IC; 5505 } 5506 5507 // Note that if we've already vectorized the loop we will have done the 5508 // runtime check and so interleaving won't require further checks. 5509 bool InterleavingRequiresRuntimePointerCheck = 5510 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5511 5512 // We want to interleave small loops in order to reduce the loop overhead and 5513 // potentially expose ILP opportunities. 5514 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5515 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5516 // We assume that the cost overhead is 1 and we use the cost model 5517 // to estimate the cost of the loop and interleave until the cost of the 5518 // loop overhead is about 5% of the cost of the loop. 5519 unsigned SmallIC = 5520 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5521 5522 // Interleave until store/load ports (estimated by max interleave count) are 5523 // saturated. 5524 unsigned NumStores = Legal->getNumStores(); 5525 unsigned NumLoads = Legal->getNumLoads(); 5526 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5527 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5528 5529 // If we have a scalar reduction (vector reductions are already dealt with 5530 // by this point), we can increase the critical path length if the loop 5531 // we're interleaving is inside another loop. Limit, by default to 2, so the 5532 // critical path only gets increased by one reduction operation. 
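// Worked example with assumed numbers (SmallLoopCost is taken at its default
// of 20 here): a loop costing 6 gives
//   SmallIC = min(IC, PowerOf2Floor(20 / 6)) = min(IC, 2),
// and with IC = 8, one store and two loads, StoresIC = 8 and LoadsIC = 4, so
// the port-saturation heuristic further down may still return 8. The clamp
// below additionally caps all three at MaxNestedScalarReductionIC.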
5533 if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) { 5534 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5535 SmallIC = std::min(SmallIC, F); 5536 StoresIC = std::min(StoresIC, F); 5537 LoadsIC = std::min(LoadsIC, F); 5538 } 5539 5540 if (EnableLoadStoreRuntimeInterleave && 5541 std::max(StoresIC, LoadsIC) > SmallIC) { 5542 LLVM_DEBUG( 5543 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5544 return std::max(StoresIC, LoadsIC); 5545 } 5546 5547 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5548 return SmallIC; 5549 } 5550 5551 // Interleave if this is a large loop (small loops are already dealt with by 5552 // this point) that could benefit from interleaving. 5553 bool HasReductions = !Legal->getReductionVars().empty(); 5554 if (TTI.enableAggressiveInterleaving(HasReductions)) { 5555 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5556 return IC; 5557 } 5558 5559 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5560 return 1; 5561 } 5562 5563 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5564 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { 5565 // This function calculates the register usage by measuring the highest number 5566 // of values that are alive at a single location. Obviously, this is a very 5567 // rough estimation. We scan the loop in a topological order in order and 5568 // assign a number to each instruction. We use RPO to ensure that defs are 5569 // met before their users. We assume that each instruction that has in-loop 5570 // users starts an interval. We record every time that an in-loop value is 5571 // used, so we have a list of the first and last occurrences of each 5572 // instruction. Next, we transpose this data structure into a multi map that 5573 // holds the list of intervals that *end* at a specific location. This multi 5574 // map allows us to perform a linear search. We scan the instructions linearly 5575 // and record each time that a new interval starts, by placing it in a set. 5576 // If we find this value in the multi-map then we remove it from the set. 5577 // The max register usage is the maximum size of the set. 5578 // We also search for instructions that are defined outside the loop, but are 5579 // used inside the loop. We need this number separately from the max-interval 5580 // usage number because when we unroll, loop-invariant values do not take 5581 // more register. 5582 LoopBlocksDFS DFS(TheLoop); 5583 DFS.perform(LI); 5584 5585 RegisterUsage RU; 5586 5587 // Each 'key' in the map opens a new interval. The values 5588 // of the map are the index of the 'last seen' usage of the 5589 // instruction that is the key. 5590 using IntervalMap = DenseMap<Instruction *, unsigned>; 5591 5592 // Maps instruction to its index. 5593 SmallVector<Instruction *, 64> IdxToInstr; 5594 // Marks the end of each interval. 5595 IntervalMap EndPoint; 5596 // Saves the list of instruction indices that are used in the loop. 5597 SmallPtrSet<Instruction *, 8> Ends; 5598 // Saves the list of values that are used in the loop but are 5599 // defined outside the loop, such as arguments and constants. 5600 SmallPtrSet<Value *, 8> LoopInvariants; 5601 5602 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5603 for (Instruction &I : BB->instructionsWithoutDebug()) { 5604 IdxToInstr.push_back(&I); 5605 5606 // Save the end location of each USE. 
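// Small example of the bookkeeping below: for "%a = load; %b = fadd %a, %x;
// store %b", the operand scan records that %a is last used by the fadd and %b
// by the store, while %x (defined outside the loop) goes into LoopInvariants
// rather than the interval set. The size of OpenIntervals at each index is
// what the debug dump later reports as the live-value estimate.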
5607 for (Value *U : I.operands()) { 5608 auto *Instr = dyn_cast<Instruction>(U); 5609 5610 // Ignore non-instruction values such as arguments, constants, etc. 5611 if (!Instr) 5612 continue; 5613 5614 // If this instruction is outside the loop then record it and continue. 5615 if (!TheLoop->contains(Instr)) { 5616 LoopInvariants.insert(Instr); 5617 continue; 5618 } 5619 5620 // Overwrite previous end points. 5621 EndPoint[Instr] = IdxToInstr.size(); 5622 Ends.insert(Instr); 5623 } 5624 } 5625 } 5626 5627 // Saves the list of intervals that end with the index in 'key'. 5628 using InstrList = SmallVector<Instruction *, 2>; 5629 DenseMap<unsigned, InstrList> TransposeEnds; 5630 5631 // Transpose the EndPoints to a list of values that end at each index. 5632 for (auto &Interval : EndPoint) 5633 TransposeEnds[Interval.second].push_back(Interval.first); 5634 5635 SmallPtrSet<Instruction *, 8> OpenIntervals; 5636 5637 // Get the size of the widest register. 5638 unsigned MaxSafeDepDist = -1U; 5639 if (Legal->getMaxSafeDepDistBytes() != -1U) 5640 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5641 unsigned WidestRegister = 5642 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5643 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5644 5645 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5646 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5647 5648 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5649 5650 // A lambda that gets the register usage for the given type and VF. 5651 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5652 if (Ty->isTokenTy()) 5653 return 0U; 5654 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5655 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5656 }; 5657 5658 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5659 Instruction *I = IdxToInstr[i]; 5660 5661 // Remove all of the instructions that end at this location. 5662 InstrList &List = TransposeEnds[i]; 5663 for (Instruction *ToRemove : List) 5664 OpenIntervals.erase(ToRemove); 5665 5666 // Ignore instructions that are never used within the loop. 5667 if (!Ends.count(I)) 5668 continue; 5669 5670 // Skip ignored values. 5671 if (ValuesToIgnore.count(I)) 5672 continue; 5673 5674 // For each VF find the maximum usage of registers. 5675 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5676 // Count the number of live intervals. 5677 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5678 5679 if (VFs[j] == 1) { 5680 for (auto Inst : OpenIntervals) { 5681 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5682 if (RegUsage.find(ClassID) == RegUsage.end()) 5683 RegUsage[ClassID] = 1; 5684 else 5685 RegUsage[ClassID] += 1; 5686 } 5687 } else { 5688 collectUniformsAndScalars(VFs[j]); 5689 for (auto Inst : OpenIntervals) { 5690 // Skip ignored values for VF > 1. 
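// For the widened (VF > 1) case handled here, a value that stays scalar
// counts as one register of its scalar class, while a widened value counts
// GetRegUsage(Ty, VF) = max(1, VF * bits(Ty) / WidestRegister) registers of
// the vector class; e.g. assuming 128-bit vector registers, an i32 needs one
// register at VF = 4 and two at VF = 8.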
5691 if (VecValuesToIgnore.count(Inst)) 5692 continue; 5693 if (isScalarAfterVectorization(Inst, VFs[j])) { 5694 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5695 if (RegUsage.find(ClassID) == RegUsage.end()) 5696 RegUsage[ClassID] = 1; 5697 else 5698 RegUsage[ClassID] += 1; 5699 } else { 5700 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5701 if (RegUsage.find(ClassID) == RegUsage.end()) 5702 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5703 else 5704 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5705 } 5706 } 5707 } 5708 5709 for (auto& pair : RegUsage) { 5710 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5711 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5712 else 5713 MaxUsages[j][pair.first] = pair.second; 5714 } 5715 } 5716 5717 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5718 << OpenIntervals.size() << '\n'); 5719 5720 // Add the current instruction to the list of open intervals. 5721 OpenIntervals.insert(I); 5722 } 5723 5724 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5725 SmallMapVector<unsigned, unsigned, 4> Invariant; 5726 5727 for (auto Inst : LoopInvariants) { 5728 unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5729 unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); 5730 if (Invariant.find(ClassID) == Invariant.end()) 5731 Invariant[ClassID] = Usage; 5732 else 5733 Invariant[ClassID] += Usage; 5734 } 5735 5736 LLVM_DEBUG({ 5737 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5738 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5739 << " item\n"; 5740 for (const auto &pair : MaxUsages[i]) { 5741 dbgs() << "LV(REG): RegisterClass: " 5742 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5743 << " registers\n"; 5744 } 5745 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5746 << " item\n"; 5747 for (const auto &pair : Invariant) { 5748 dbgs() << "LV(REG): RegisterClass: " 5749 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5750 << " registers\n"; 5751 } 5752 }); 5753 5754 RU.LoopInvariantRegs = Invariant; 5755 RU.MaxLocalUsers = MaxUsages[i]; 5756 RUs[i] = RU; 5757 } 5758 5759 return RUs; 5760 } 5761 5762 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5763 // TODO: Cost model for emulated masked load/store is completely 5764 // broken. This hack guides the cost model to use an artificially 5765 // high enough value to practically disable vectorization with such 5766 // operations, except where previously deployed legality hack allowed 5767 // using very low cost values. This is to avoid regressions coming simply 5768 // from moving "masked load/store" check from legality to cost model. 5769 // Masked Load/Gather emulation was previously never allowed. 5770 // Limited number of Masked Store/Scatter emulation was allowed. 5771 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 5772 return isa<LoadInst>(I) || 5773 (isa<StoreInst>(I) && 5774 NumPredStores > NumberOfStoresToPredicate); 5775 } 5776 5777 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { 5778 // If we aren't vectorizing the loop, or if we've already collected the 5779 // instructions to scalarize, there's nothing to do. Collection may already 5780 // have occurred if we have a user-selected VF and are now computing the 5781 // expected cost for interleaving. 
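// In short: for every instruction below that must be scalarized and
// predicated (e.g. a conditionally executed store, or a div/rem that cannot
// be speculated), computePredInstDiscount asks whether scalarizing the
// single-use chain feeding it is no more expensive than widening that chain;
// if so, the chain's scalar costs are recorded in InstsToScalarize[VF] and
// later returned by getInstructionCost.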
5782 if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end()) 5783 return; 5784 5785 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 5786 // not profitable to scalarize any instructions, the presence of VF in the 5787 // map will indicate that we've analyzed it already. 5788 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 5789 5790 // Find all the instructions that are scalar with predication in the loop and 5791 // determine if it would be better to not if-convert the blocks they are in. 5792 // If so, we also record the instructions to scalarize. 5793 for (BasicBlock *BB : TheLoop->blocks()) { 5794 if (!blockNeedsPredication(BB)) 5795 continue; 5796 for (Instruction &I : *BB) 5797 if (isScalarWithPredication(&I)) { 5798 ScalarCostsTy ScalarCosts; 5799 // Do not apply discount logic if hacked cost is needed 5800 // for emulated masked memrefs. 5801 if (!useEmulatedMaskMemRefHack(&I) && 5802 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 5803 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5804 // Remember that BB will remain after vectorization. 5805 PredicatedBBsAfterVectorization.insert(BB); 5806 } 5807 } 5808 } 5809 5810 int LoopVectorizationCostModel::computePredInstDiscount( 5811 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, 5812 unsigned VF) { 5813 assert(!isUniformAfterVectorization(PredInst, VF) && 5814 "Instruction marked uniform-after-vectorization will be predicated"); 5815 5816 // Initialize the discount to zero, meaning that the scalar version and the 5817 // vector version cost the same. 5818 int Discount = 0; 5819 5820 // Holds instructions to analyze. The instructions we visit are mapped in 5821 // ScalarCosts. Those instructions are the ones that would be scalarized if 5822 // we find that the scalar version costs less. 5823 SmallVector<Instruction *, 8> Worklist; 5824 5825 // Returns true if the given instruction can be scalarized. 5826 auto canBeScalarized = [&](Instruction *I) -> bool { 5827 // We only attempt to scalarize instructions forming a single-use chain 5828 // from the original predicated block that would otherwise be vectorized. 5829 // Although not strictly necessary, we give up on instructions we know will 5830 // already be scalar to avoid traversing chains that are unlikely to be 5831 // beneficial. 5832 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5833 isScalarAfterVectorization(I, VF)) 5834 return false; 5835 5836 // If the instruction is scalar with predication, it will be analyzed 5837 // separately. We ignore it within the context of PredInst. 5838 if (isScalarWithPredication(I)) 5839 return false; 5840 5841 // If any of the instruction's operands are uniform after vectorization, 5842 // the instruction cannot be scalarized. This prevents, for example, a 5843 // masked load from being scalarized. 5844 // 5845 // We assume we will only emit a value for lane zero of an instruction 5846 // marked uniform after vectorization, rather than VF identical values. 5847 // Thus, if we scalarize an instruction that uses a uniform, we would 5848 // create uses of values corresponding to the lanes we aren't emitting code 5849 // for. This behavior can be changed by allowing getScalarValue to clone 5850 // the lane zero values for uniforms rather than asserting. 
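// Concretely: if %base is uniform-after-vectorization, only its lane-0 value
// is ever materialized, so a chain member such as "%p = getelementptr %base,
// %i" cannot be pulled into the scalarized chain; scalarizing it would need
// per-lane copies of %base that are never emitted.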
5851 for (Use &U : I->operands()) 5852 if (auto *J = dyn_cast<Instruction>(U.get())) 5853 if (isUniformAfterVectorization(J, VF)) 5854 return false; 5855 5856 // Otherwise, we can scalarize the instruction. 5857 return true; 5858 }; 5859 5860 // Compute the expected cost discount from scalarizing the entire expression 5861 // feeding the predicated instruction. We currently only consider expressions 5862 // that are single-use instruction chains. 5863 Worklist.push_back(PredInst); 5864 while (!Worklist.empty()) { 5865 Instruction *I = Worklist.pop_back_val(); 5866 5867 // If we've already analyzed the instruction, there's nothing to do. 5868 if (ScalarCosts.find(I) != ScalarCosts.end()) 5869 continue; 5870 5871 // Compute the cost of the vector instruction. Note that this cost already 5872 // includes the scalarization overhead of the predicated instruction. 5873 unsigned VectorCost = getInstructionCost(I, VF).first; 5874 5875 // Compute the cost of the scalarized instruction. This cost is the cost of 5876 // the instruction as if it wasn't if-converted and instead remained in the 5877 // predicated block. We will scale this cost by block probability after 5878 // computing the scalarization overhead. 5879 unsigned ScalarCost = VF * getInstructionCost(I, 1).first; 5880 5881 // Compute the scalarization overhead of needed insertelement instructions 5882 // and phi nodes. 5883 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5884 ScalarCost += TTI.getScalarizationOverhead( 5885 cast<VectorType>(ToVectorTy(I->getType(), VF)), 5886 APInt::getAllOnesValue(VF), true, false); 5887 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI, 5888 TTI::TCK_RecipThroughput); 5889 } 5890 5891 // Compute the scalarization overhead of needed extractelement 5892 // instructions. For each of the instruction's operands, if the operand can 5893 // be scalarized, add it to the worklist; otherwise, account for the 5894 // overhead. 5895 for (Use &U : I->operands()) 5896 if (auto *J = dyn_cast<Instruction>(U.get())) { 5897 assert(VectorType::isValidElementType(J->getType()) && 5898 "Instruction has non-scalar type"); 5899 if (canBeScalarized(J)) 5900 Worklist.push_back(J); 5901 else if (needsExtract(J, VF)) 5902 ScalarCost += TTI.getScalarizationOverhead( 5903 cast<VectorType>(ToVectorTy(J->getType(), VF)), 5904 APInt::getAllOnesValue(VF), false, true); 5905 } 5906 5907 // Scale the total scalar cost by block probability. 5908 ScalarCost /= getReciprocalPredBlockProb(); 5909 5910 // Compute the discount. A non-negative discount means the vector version 5911 // of the instruction costs more, and scalarizing would be beneficial. 5912 Discount += VectorCost - ScalarCost; 5913 ScalarCosts[I] = ScalarCost; 5914 } 5915 5916 return Discount; 5917 } 5918 5919 LoopVectorizationCostModel::VectorizationCostTy 5920 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5921 VectorizationCostTy Cost; 5922 5923 // For each block. 5924 for (BasicBlock *BB : TheLoop->blocks()) { 5925 VectorizationCostTy BlockCost; 5926 5927 // For each instruction in the old loop. 5928 for (Instruction &I : BB->instructionsWithoutDebug()) { 5929 // Skip ignored values. 5930 if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I))) 5931 continue; 5932 5933 VectorizationCostTy C = getInstructionCost(&I, VF); 5934 5935 // Check if we should override the cost. 
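// ForceTargetInstructionCost, checked below, replaces every per-instruction
// estimate with a single constant (e.g. to make tests independent of target
// cost tables). Further down, a predicated block's scalar cost is divided by
// the reciprocal block probability, so a block whose instructions sum to 8
// contributes only 4 when it is expected to execute about half the time.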
5936 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5937 C.first = ForceTargetInstructionCost; 5938 5939 BlockCost.first += C.first; 5940 BlockCost.second |= C.second; 5941 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5942 << " for VF " << VF << " For instruction: " << I 5943 << '\n'); 5944 } 5945 5946 // If we are vectorizing a predicated block, it will have been 5947 // if-converted. This means that the block's instructions (aside from 5948 // stores and instructions that may divide by zero) will now be 5949 // unconditionally executed. For the scalar case, we may not always execute 5950 // the predicated block. Thus, scale the block's cost by the probability of 5951 // executing it. 5952 if (VF == 1 && blockNeedsPredication(BB)) 5953 BlockCost.first /= getReciprocalPredBlockProb(); 5954 5955 Cost.first += BlockCost.first; 5956 Cost.second |= BlockCost.second; 5957 } 5958 5959 return Cost; 5960 } 5961 5962 /// Gets Address Access SCEV after verifying that the access pattern 5963 /// is loop invariant except the induction variable dependence. 5964 /// 5965 /// This SCEV can be sent to the Target in order to estimate the address 5966 /// calculation cost. 5967 static const SCEV *getAddressAccessSCEV( 5968 Value *Ptr, 5969 LoopVectorizationLegality *Legal, 5970 PredicatedScalarEvolution &PSE, 5971 const Loop *TheLoop) { 5972 5973 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5974 if (!Gep) 5975 return nullptr; 5976 5977 // We are looking for a gep with all loop invariant indices except for one 5978 // which should be an induction variable. 5979 auto SE = PSE.getSE(); 5980 unsigned NumOperands = Gep->getNumOperands(); 5981 for (unsigned i = 1; i < NumOperands; ++i) { 5982 Value *Opd = Gep->getOperand(i); 5983 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5984 !Legal->isInductionVariable(Opd)) 5985 return nullptr; 5986 } 5987 5988 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 5989 return PSE.getSCEV(Ptr); 5990 } 5991 5992 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 5993 return Legal->hasStride(I->getOperand(0)) || 5994 Legal->hasStride(I->getOperand(1)); 5995 } 5996 5997 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5998 unsigned VF) { 5999 assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); 6000 Type *ValTy = getMemInstValueType(I); 6001 auto SE = PSE.getSE(); 6002 6003 unsigned AS = getLoadStoreAddressSpace(I); 6004 Value *Ptr = getLoadStorePointerOperand(I); 6005 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6006 6007 // Figure out whether the access is strided and get the stride value 6008 // if it's known in compile time 6009 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6010 6011 // Get the cost of the scalar memory instruction and address computation. 6012 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6013 6014 // Don't pass *I here, since it is scalar but will actually be part of a 6015 // vectorized loop where the user of it is a vectorized instruction. 6016 const Align Alignment = getLoadStoreAlignment(I); 6017 Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 6018 Alignment, AS, 6019 TTI::TCK_RecipThroughput); 6020 6021 // Get the overhead of the extractelement and insertelement instructions 6022 // we might create due to scalarization. 
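// Hypothetical VF = 4 example: four scalar address computations and four
// scalar loads at unit cost give 8, and the insert/extract overhead added
// below may contribute a similar amount. A predicated access is then divided
// by the reciprocal block probability; if useEmulatedMaskMemRefHack applies,
// the cost is instead pinned at 3000000 to keep emulated masked accesses from
// being vectorized.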
6023 Cost += getScalarizationOverhead(I, VF); 6024 6025 // If we have a predicated store, it may not be executed for each vector 6026 // lane. Scale the cost by the probability of executing the predicated 6027 // block. 6028 if (isPredicatedInst(I)) { 6029 Cost /= getReciprocalPredBlockProb(); 6030 6031 if (useEmulatedMaskMemRefHack(I)) 6032 // Artificially setting to a high enough value to practically disable 6033 // vectorization with such operations. 6034 Cost = 3000000; 6035 } 6036 6037 return Cost; 6038 } 6039 6040 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6041 unsigned VF) { 6042 Type *ValTy = getMemInstValueType(I); 6043 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6044 Value *Ptr = getLoadStorePointerOperand(I); 6045 unsigned AS = getLoadStoreAddressSpace(I); 6046 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6047 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6048 6049 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6050 "Stride should be 1 or -1 for consecutive memory access"); 6051 const Align Alignment = getLoadStoreAlignment(I); 6052 unsigned Cost = 0; 6053 if (Legal->isMaskRequired(I)) 6054 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6055 CostKind); 6056 else 6057 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6058 CostKind, I); 6059 6060 bool Reverse = ConsecutiveStride < 0; 6061 if (Reverse) 6062 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6063 return Cost; 6064 } 6065 6066 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6067 unsigned VF) { 6068 Type *ValTy = getMemInstValueType(I); 6069 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6070 const Align Alignment = getLoadStoreAlignment(I); 6071 unsigned AS = getLoadStoreAddressSpace(I); 6072 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6073 if (isa<LoadInst>(I)) { 6074 return TTI.getAddressComputationCost(ValTy) + 6075 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6076 CostKind) + 6077 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6078 } 6079 StoreInst *SI = cast<StoreInst>(I); 6080 6081 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6082 return TTI.getAddressComputationCost(ValTy) + 6083 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6084 CostKind) + 6085 (isLoopInvariantStoreValue 6086 ? 
0 6087 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6088 VF - 1)); 6089 } 6090 6091 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6092 unsigned VF) { 6093 Type *ValTy = getMemInstValueType(I); 6094 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6095 const Align Alignment = getLoadStoreAlignment(I); 6096 const Value *Ptr = getLoadStorePointerOperand(I); 6097 6098 return TTI.getAddressComputationCost(VectorTy) + 6099 TTI.getGatherScatterOpCost( 6100 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6101 TargetTransformInfo::TCK_RecipThroughput, I); 6102 } 6103 6104 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6105 unsigned VF) { 6106 Type *ValTy = getMemInstValueType(I); 6107 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6108 unsigned AS = getLoadStoreAddressSpace(I); 6109 6110 auto Group = getInterleavedAccessGroup(I); 6111 assert(Group && "Fail to get an interleaved access group."); 6112 6113 unsigned InterleaveFactor = Group->getFactor(); 6114 auto *WideVecTy = FixedVectorType::get(ValTy, VF * InterleaveFactor); 6115 6116 // Holds the indices of existing members in an interleaved load group. 6117 // An interleaved store group doesn't need this as it doesn't allow gaps. 6118 SmallVector<unsigned, 4> Indices; 6119 if (isa<LoadInst>(I)) { 6120 for (unsigned i = 0; i < InterleaveFactor; i++) 6121 if (Group->getMember(i)) 6122 Indices.push_back(i); 6123 } 6124 6125 // Calculate the cost of the whole interleaved group. 6126 bool UseMaskForGaps = 6127 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6128 unsigned Cost = TTI.getInterleavedMemoryOpCost( 6129 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6130 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6131 6132 if (Group->isReverse()) { 6133 // TODO: Add support for reversed masked interleaved access. 6134 assert(!Legal->isMaskRequired(I) && 6135 "Reverse masked interleaved access not supported."); 6136 Cost += Group->getNumMembers() * 6137 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6138 } 6139 return Cost; 6140 } 6141 6142 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6143 unsigned VF) { 6144 // Calculate scalar cost only. Vectorization cost should be ready at this 6145 // moment. 6146 if (VF == 1) { 6147 Type *ValTy = getMemInstValueType(I); 6148 const Align Alignment = getLoadStoreAlignment(I); 6149 unsigned AS = getLoadStoreAddressSpace(I); 6150 6151 return TTI.getAddressComputationCost(ValTy) + 6152 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6153 TTI::TCK_RecipThroughput, I); 6154 } 6155 return getWideningCost(I, VF); 6156 } 6157 6158 LoopVectorizationCostModel::VectorizationCostTy 6159 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 6160 // If we know that this instruction will remain uniform, check the cost of 6161 // the scalar version. 6162 if (isUniformAfterVectorization(I, VF)) 6163 VF = 1; 6164 6165 if (VF > 1 && isProfitableToScalarize(I, VF)) 6166 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6167 6168 // Forced scalars do not have any scalarization overhead. 
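// For example, an instruction that only feeds scalarized address computations
// is placed into ForcedScalars by setCostBasedWideningDecision; at VF = 4 it
// is simply costed as 4 * its scalar cost, with no insert/extract overhead
// added on top.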
6169 auto ForcedScalar = ForcedScalars.find(VF); 6170 if (VF > 1 && ForcedScalar != ForcedScalars.end()) { 6171 auto InstSet = ForcedScalar->second; 6172 if (InstSet.count(I)) 6173 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); 6174 } 6175 6176 Type *VectorTy; 6177 unsigned C = getInstructionCost(I, VF, VectorTy); 6178 6179 bool TypeNotScalarized = 6180 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; 6181 return VectorizationCostTy(C, TypeNotScalarized); 6182 } 6183 6184 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6185 unsigned VF) { 6186 6187 if (VF == 1) 6188 return 0; 6189 6190 unsigned Cost = 0; 6191 Type *RetTy = ToVectorTy(I->getType(), VF); 6192 if (!RetTy->isVoidTy() && 6193 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6194 Cost += TTI.getScalarizationOverhead( 6195 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF), true, false); 6196 6197 // Some targets keep addresses scalar. 6198 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6199 return Cost; 6200 6201 // Some targets support efficient element stores. 6202 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6203 return Cost; 6204 6205 // Collect operands to consider. 6206 CallInst *CI = dyn_cast<CallInst>(I); 6207 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6208 6209 // Skip operands that do not require extraction/scalarization and do not incur 6210 // any overhead. 6211 return Cost + TTI.getOperandsScalarizationOverhead( 6212 filterExtractingOperands(Ops, VF), VF); 6213 } 6214 6215 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { 6216 if (VF == 1) 6217 return; 6218 NumPredStores = 0; 6219 for (BasicBlock *BB : TheLoop->blocks()) { 6220 // For each instruction in the old loop. 6221 for (Instruction &I : *BB) { 6222 Value *Ptr = getLoadStorePointerOperand(&I); 6223 if (!Ptr) 6224 continue; 6225 6226 // TODO: We should generate better code and update the cost model for 6227 // predicated uniform stores. Today they are treated as any other 6228 // predicated store (see added test cases in 6229 // invariant-store-vectorization.ll). 6230 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6231 NumPredStores++; 6232 6233 if (Legal->isUniform(Ptr) && 6234 // Conditional loads and stores should be scalarized and predicated. 6235 // isScalarWithPredication cannot be used here since masked 6236 // gather/scatters are not considered scalar with predication. 6237 !Legal->blockNeedsPredication(I.getParent())) { 6238 // TODO: Avoid replicating loads and stores instead of 6239 // relying on instcombine to remove them. 6240 // Load: Scalar load + broadcast 6241 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6242 unsigned Cost = getUniformMemOpCost(&I, VF); 6243 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6244 continue; 6245 } 6246 6247 // We assume that widening is the best solution when possible. 6248 if (memoryInstructionCanBeWidened(&I, VF)) { 6249 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6250 int ConsecutiveStride = 6251 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6252 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6253 "Expected consecutive stride."); 6254 InstWidening Decision = 6255 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6256 setWideningDecision(&I, VF, Decision, Cost); 6257 continue; 6258 } 6259 6260 // Choose between Interleaving, Gather/Scatter or Scalarization. 
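// Example with hypothetical costs: InterleaveCost = 10, GatherScatterCost =
// UINT_MAX (no legal gather/scatter) and ScalarizationCost = 24 selects
// CM_Interleave with cost 10. Note the tie-breaking below: interleaving wins
// a tie against gather/scatter but must be strictly cheaper than
// scalarization, and scalarization wins a tie against gather/scatter.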
6261 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6262 unsigned NumAccesses = 1; 6263 if (isAccessInterleaved(&I)) { 6264 auto Group = getInterleavedAccessGroup(&I); 6265 assert(Group && "Fail to get an interleaved access group."); 6266 6267 // Make one decision for the whole group. 6268 if (getWideningDecision(&I, VF) != CM_Unknown) 6269 continue; 6270 6271 NumAccesses = Group->getNumMembers(); 6272 if (interleavedAccessCanBeWidened(&I, VF)) 6273 InterleaveCost = getInterleaveGroupCost(&I, VF); 6274 } 6275 6276 unsigned GatherScatterCost = 6277 isLegalGatherOrScatter(&I) 6278 ? getGatherScatterCost(&I, VF) * NumAccesses 6279 : std::numeric_limits<unsigned>::max(); 6280 6281 unsigned ScalarizationCost = 6282 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6283 6284 // Choose better solution for the current VF, 6285 // write down this decision and use it during vectorization. 6286 unsigned Cost; 6287 InstWidening Decision; 6288 if (InterleaveCost <= GatherScatterCost && 6289 InterleaveCost < ScalarizationCost) { 6290 Decision = CM_Interleave; 6291 Cost = InterleaveCost; 6292 } else if (GatherScatterCost < ScalarizationCost) { 6293 Decision = CM_GatherScatter; 6294 Cost = GatherScatterCost; 6295 } else { 6296 Decision = CM_Scalarize; 6297 Cost = ScalarizationCost; 6298 } 6299 // If the instructions belongs to an interleave group, the whole group 6300 // receives the same decision. The whole group receives the cost, but 6301 // the cost will actually be assigned to one instruction. 6302 if (auto Group = getInterleavedAccessGroup(&I)) 6303 setWideningDecision(Group, VF, Decision, Cost); 6304 else 6305 setWideningDecision(&I, VF, Decision, Cost); 6306 } 6307 } 6308 6309 // Make sure that any load of address and any other address computation 6310 // remains scalar unless there is gather/scatter support. This avoids 6311 // inevitable extracts into address registers, and also has the benefit of 6312 // activating LSR more, since that pass can't optimize vectorized 6313 // addresses. 6314 if (TTI.prefersVectorizedAddressing()) 6315 return; 6316 6317 // Start with all scalar pointer uses. 6318 SmallPtrSet<Instruction *, 8> AddrDefs; 6319 for (BasicBlock *BB : TheLoop->blocks()) 6320 for (Instruction &I : *BB) { 6321 Instruction *PtrDef = 6322 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6323 if (PtrDef && TheLoop->contains(PtrDef) && 6324 getWideningDecision(&I, VF) != CM_GatherScatter) 6325 AddrDefs.insert(PtrDef); 6326 } 6327 6328 // Add all instructions used to generate the addresses. 6329 SmallVector<Instruction *, 4> Worklist; 6330 for (auto *I : AddrDefs) 6331 Worklist.push_back(I); 6332 while (!Worklist.empty()) { 6333 Instruction *I = Worklist.pop_back_val(); 6334 for (auto &Op : I->operands()) 6335 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6336 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6337 AddrDefs.insert(InstOp).second) 6338 Worklist.push_back(InstOp); 6339 } 6340 6341 for (auto *I : AddrDefs) { 6342 if (isa<LoadInst>(I)) { 6343 // Setting the desired widening decision should ideally be handled in 6344 // by cost functions, but since this involves the task of finding out 6345 // if the loaded register is involved in an address computation, it is 6346 // instead changed here when we know this is the case. 6347 InstWidening Decision = getWideningDecision(I, VF); 6348 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6349 // Scalarize a widened load of address. 
6350 setWideningDecision(I, VF, CM_Scalarize, 6351 (VF * getMemoryInstructionCost(I, 1))); 6352 else if (auto Group = getInterleavedAccessGroup(I)) { 6353 // Scalarize an interleave group of address loads. 6354 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6355 if (Instruction *Member = Group->getMember(I)) 6356 setWideningDecision(Member, VF, CM_Scalarize, 6357 (VF * getMemoryInstructionCost(Member, 1))); 6358 } 6359 } 6360 } else 6361 // Make sure I gets scalarized and a cost estimate without 6362 // scalarization overhead. 6363 ForcedScalars[VF].insert(I); 6364 } 6365 } 6366 6367 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6368 unsigned VF, 6369 Type *&VectorTy) { 6370 Type *RetTy = I->getType(); 6371 if (canTruncateToMinimalBitwidth(I, VF)) 6372 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6373 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6374 auto SE = PSE.getSE(); 6375 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6376 6377 // TODO: We need to estimate the cost of intrinsic calls. 6378 switch (I->getOpcode()) { 6379 case Instruction::GetElementPtr: 6380 // We mark this instruction as zero-cost because the cost of GEPs in 6381 // vectorized code depends on whether the corresponding memory instruction 6382 // is scalarized or not. Therefore, we handle GEPs with the memory 6383 // instruction cost. 6384 return 0; 6385 case Instruction::Br: { 6386 // In cases of scalarized and predicated instructions, there will be VF 6387 // predicated blocks in the vectorized loop. Each branch around these 6388 // blocks requires also an extract of its vector compare i1 element. 6389 bool ScalarPredicatedBB = false; 6390 BranchInst *BI = cast<BranchInst>(I); 6391 if (VF > 1 && BI->isConditional() && 6392 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 6393 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 6394 ScalarPredicatedBB = true; 6395 6396 if (ScalarPredicatedBB) { 6397 // Return cost for branches around scalarized and predicated blocks. 6398 auto *Vec_i1Ty = 6399 FixedVectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6400 return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF), 6401 false, true) + 6402 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF)); 6403 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) 6404 // The back-edge branch will remain, as will all scalar branches. 6405 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6406 else 6407 // This branch will be eliminated by if-conversion. 6408 return 0; 6409 // Note: We currently assume zero cost for an unconditional branch inside 6410 // a predicated block since it will become a fall-through, although we 6411 // may decide in the future to call TTI for all branches. 6412 } 6413 case Instruction::PHI: { 6414 auto *Phi = cast<PHINode>(I); 6415 6416 // First-order recurrences are replaced by vector shuffles inside the loop. 6417 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6418 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) 6419 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 6420 cast<VectorType>(VectorTy), VF - 1, 6421 FixedVectorType::get(RetTy, 1)); 6422 6423 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6424 // converted into select instructions. We require N - 1 selects per phi 6425 // node, where N is the number of incoming values. 
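// For instance, a phi merging three if-converted paths becomes two vector
// selects, so at VF = 4 its estimate is
//   2 * getCmpSelInstrCost(Select, <4 x ty>, <4 x i1>).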
6426 if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) 6427 return (Phi->getNumIncomingValues() - 1) * 6428 TTI.getCmpSelInstrCost( 6429 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6430 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6431 CostKind); 6432 6433 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6434 } 6435 case Instruction::UDiv: 6436 case Instruction::SDiv: 6437 case Instruction::URem: 6438 case Instruction::SRem: 6439 // If we have a predicated instruction, it may not be executed for each 6440 // vector lane. Get the scalarization cost and scale this amount by the 6441 // probability of executing the predicated block. If the instruction is not 6442 // predicated, we fall through to the next case. 6443 if (VF > 1 && isScalarWithPredication(I)) { 6444 unsigned Cost = 0; 6445 6446 // These instructions have a non-void type, so account for the phi nodes 6447 // that we will create. This cost is likely to be zero. The phi node 6448 // cost, if any, should be scaled by the block probability because it 6449 // models a copy at the end of each predicated block. 6450 Cost += VF * TTI.getCFInstrCost(Instruction::PHI, CostKind); 6451 6452 // The cost of the non-predicated instruction. 6453 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 6454 6455 // The cost of insertelement and extractelement instructions needed for 6456 // scalarization. 6457 Cost += getScalarizationOverhead(I, VF); 6458 6459 // Scale the cost by the probability of executing the predicated blocks. 6460 // This assumes the predicated block for each vector lane is equally 6461 // likely. 6462 return Cost / getReciprocalPredBlockProb(); 6463 } 6464 LLVM_FALLTHROUGH; 6465 case Instruction::Add: 6466 case Instruction::FAdd: 6467 case Instruction::Sub: 6468 case Instruction::FSub: 6469 case Instruction::Mul: 6470 case Instruction::FMul: 6471 case Instruction::FDiv: 6472 case Instruction::FRem: 6473 case Instruction::Shl: 6474 case Instruction::LShr: 6475 case Instruction::AShr: 6476 case Instruction::And: 6477 case Instruction::Or: 6478 case Instruction::Xor: { 6479 // Since we will replace the stride by 1 the multiplication should go away. 6480 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6481 return 0; 6482 // Certain instructions can be cheaper to vectorize if they have a constant 6483 // second vector operand. One example of this are shifts on x86. 6484 Value *Op2 = I->getOperand(1); 6485 TargetTransformInfo::OperandValueProperties Op2VP; 6486 TargetTransformInfo::OperandValueKind Op2VK = 6487 TTI.getOperandInfo(Op2, Op2VP); 6488 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6489 Op2VK = TargetTransformInfo::OK_UniformValue; 6490 6491 SmallVector<const Value *, 4> Operands(I->operand_values()); 6492 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6493 return N * TTI.getArithmeticInstrCost( 6494 I->getOpcode(), VectorTy, CostKind, 6495 TargetTransformInfo::OK_AnyValue, 6496 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6497 } 6498 case Instruction::FNeg: { 6499 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6500 return N * TTI.getArithmeticInstrCost( 6501 I->getOpcode(), VectorTy, CostKind, 6502 TargetTransformInfo::OK_AnyValue, 6503 TargetTransformInfo::OK_AnyValue, 6504 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6505 I->getOperand(0), I); 6506 } 6507 case Instruction::Select: { 6508 SelectInst *SI = cast<SelectInst>(I); 6509 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6510 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6511 Type *CondTy = SI->getCondition()->getType(); 6512 if (!ScalarCond) 6513 CondTy = FixedVectorType::get(CondTy, VF); 6514 6515 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 6516 CostKind, I); 6517 } 6518 case Instruction::ICmp: 6519 case Instruction::FCmp: { 6520 Type *ValTy = I->getOperand(0)->getType(); 6521 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6522 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6523 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6524 VectorTy = ToVectorTy(ValTy, VF); 6525 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind, 6526 I); 6527 } 6528 case Instruction::Store: 6529 case Instruction::Load: { 6530 unsigned Width = VF; 6531 if (Width > 1) { 6532 InstWidening Decision = getWideningDecision(I, Width); 6533 assert(Decision != CM_Unknown && 6534 "CM decision should be taken at this point"); 6535 if (Decision == CM_Scalarize) 6536 Width = 1; 6537 } 6538 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6539 return getMemoryInstructionCost(I, VF); 6540 } 6541 case Instruction::ZExt: 6542 case Instruction::SExt: 6543 case Instruction::FPToUI: 6544 case Instruction::FPToSI: 6545 case Instruction::FPExt: 6546 case Instruction::PtrToInt: 6547 case Instruction::IntToPtr: 6548 case Instruction::SIToFP: 6549 case Instruction::UIToFP: 6550 case Instruction::Trunc: 6551 case Instruction::FPTrunc: 6552 case Instruction::BitCast: { 6553 // Computes the CastContextHint from a Load/Store instruction. 6554 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 6555 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 6556 "Expected a load or a store!"); 6557 6558 if (VF == 1 || !TheLoop->contains(I)) 6559 return TTI::CastContextHint::Normal; 6560 6561 switch (getWideningDecision(I, VF)) { 6562 case LoopVectorizationCostModel::CM_GatherScatter: 6563 return TTI::CastContextHint::GatherScatter; 6564 case LoopVectorizationCostModel::CM_Interleave: 6565 return TTI::CastContextHint::Interleave; 6566 case LoopVectorizationCostModel::CM_Scalarize: 6567 case LoopVectorizationCostModel::CM_Widen: 6568 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 6569 : TTI::CastContextHint::Normal; 6570 case LoopVectorizationCostModel::CM_Widen_Reverse: 6571 return TTI::CastContextHint::Reversed; 6572 case LoopVectorizationCostModel::CM_Unknown: 6573 llvm_unreachable("Instr did not go through cost modelling?"); 6574 } 6575 6576 llvm_unreachable("Unhandled case!"); 6577 }; 6578 6579 unsigned Opcode = I->getOpcode(); 6580 TTI::CastContextHint CCH = TTI::CastContextHint::None; 6581 // For Trunc, the context is the only user, which must be a StoreInst. 6582 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 6583 if (I->hasOneUse()) 6584 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 6585 CCH = ComputeCCH(Store); 6586 } 6587 // For Z/Sext, the context is the operand, which must be a LoadInst. 
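// (Illustrative: a sext whose load operand was decided to be widened in
// reverse is costed with CastContextHint::Reversed, so a target that folds
// the extend into its reversed-load sequence can report the cheaper combined
// cost.)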
6588 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 6589 Opcode == Instruction::FPExt) { 6590 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 6591 CCH = ComputeCCH(Load); 6592 } 6593 6594 // We optimize the truncation of induction variables having constant 6595 // integer steps. The cost of these truncations is the same as the scalar 6596 // operation. 6597 if (isOptimizableIVTruncate(I, VF)) { 6598 auto *Trunc = cast<TruncInst>(I); 6599 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6600 Trunc->getSrcTy(), CCH, CostKind, Trunc); 6601 } 6602 6603 Type *SrcScalarTy = I->getOperand(0)->getType(); 6604 Type *SrcVecTy = 6605 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6606 if (canTruncateToMinimalBitwidth(I, VF)) { 6607 // This cast is going to be shrunk. This may remove the cast or it might 6608 // turn it into slightly different cast. For example, if MinBW == 16, 6609 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6610 // 6611 // Calculate the modified src and dest types. 6612 Type *MinVecTy = VectorTy; 6613 if (Opcode == Instruction::Trunc) { 6614 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6615 VectorTy = 6616 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6617 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 6618 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6619 VectorTy = 6620 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6621 } 6622 } 6623 6624 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6625 return N * 6626 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 6627 } 6628 case Instruction::Call: { 6629 bool NeedToScalarize; 6630 CallInst *CI = cast<CallInst>(I); 6631 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6632 if (getVectorIntrinsicIDForCall(CI, TLI)) 6633 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6634 return CallCost; 6635 } 6636 default: 6637 // The cost of executing VF copies of the scalar instruction. This opcode 6638 // is unknown. Assume that it is the same as 'mul'. 6639 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, 6640 CostKind) + 6641 getScalarizationOverhead(I, VF); 6642 } // end of switch. 
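// Note on the default case above: an unrecognised opcode is treated like a
// multiply, i.e. VF * the vector mul cost plus insert/extract overhead, which
// is a rough, conservative guess.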
6643 } 6644 6645 char LoopVectorize::ID = 0; 6646 6647 static const char lv_name[] = "Loop Vectorization"; 6648 6649 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6650 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6651 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6652 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6653 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6654 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6655 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6656 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6657 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6658 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6659 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6660 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6661 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6662 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6663 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6664 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6665 6666 namespace llvm { 6667 6668 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6669 6670 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6671 bool VectorizeOnlyWhenForced) { 6672 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6673 } 6674 6675 } // end namespace llvm 6676 6677 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6678 // Check if the pointer operand of a load or store instruction is 6679 // consecutive. 6680 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6681 return Legal->isConsecutivePtr(Ptr); 6682 return false; 6683 } 6684 6685 void LoopVectorizationCostModel::collectValuesToIgnore() { 6686 // Ignore ephemeral values. 6687 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6688 6689 // Ignore type-promoting instructions we identified during reduction 6690 // detection. 6691 for (auto &Reduction : Legal->getReductionVars()) { 6692 RecurrenceDescriptor &RedDes = Reduction.second; 6693 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6694 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6695 } 6696 // Ignore type-casting instructions we identified during induction 6697 // detection. 6698 for (auto &Induction : Legal->getInductionVars()) { 6699 InductionDescriptor &IndDes = Induction.second; 6700 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6701 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6702 } 6703 } 6704 6705 void LoopVectorizationCostModel::collectInLoopReductions() { 6706 // For the moment, without predicated reduction instructions, we do not 6707 // support inloop reductions whilst folding the tail, and hence in those cases 6708 // all reductions are currently out of the loop. 6709 if (!PreferInLoopReductions || foldTailByMasking()) 6710 return; 6711 6712 for (auto &Reduction : Legal->getReductionVars()) { 6713 PHINode *Phi = Reduction.first; 6714 RecurrenceDescriptor &RdxDesc = Reduction.second; 6715 6716 // We don't collect reductions that are type promoted (yet). 6717 if (RdxDesc.getRecurrenceType() != Phi->getType()) 6718 continue; 6719 6720 // Check that we can correctly put the reductions into the loop, by 6721 // finding the chain of operations that leads from the phi to the loop 6722 // exit value. 
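// For a plain sum reduction (s += a[i]) the chain is just the add linking the
// header phi to the value used after the loop; if getReductionOpChain cannot
// isolate such a chain it returns an empty list and the reduction stays
// out-of-loop.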
6723 SmallVector<Instruction *, 4> ReductionOperations = 6724 RdxDesc.getReductionOpChain(Phi, TheLoop); 6725 bool InLoop = !ReductionOperations.empty(); 6726 if (InLoop) 6727 InLoopReductionChains[Phi] = ReductionOperations; 6728 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 6729 << " reduction for phi: " << *Phi << "\n"); 6730 } 6731 } 6732 6733 // TODO: we could return a pair of values that specify the max VF and 6734 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6735 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 6736 // doesn't have a cost model that can choose which plan to execute if 6737 // more than one is generated. 6738 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 6739 LoopVectorizationCostModel &CM) { 6740 unsigned WidestType; 6741 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 6742 return WidestVectorRegBits / WidestType; 6743 } 6744 6745 VectorizationFactor 6746 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { 6747 unsigned VF = UserVF; 6748 // Outer loop handling: They may require CFG and instruction level 6749 // transformations before even evaluating whether vectorization is profitable. 6750 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 6751 // the vectorization pipeline. 6752 if (!OrigLoop->empty()) { 6753 // If the user doesn't provide a vectorization factor, determine a 6754 // reasonable one. 6755 if (!UserVF) { 6756 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM); 6757 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 6758 6759 // Make sure we have a VF > 1 for stress testing. 6760 if (VPlanBuildStressTest && VF < 2) { 6761 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 6762 << "overriding computed VF.\n"); 6763 VF = 4; 6764 } 6765 } 6766 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 6767 assert(isPowerOf2_32(VF) && "VF needs to be a power of two"); 6768 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF 6769 << " to build VPlans.\n"); 6770 buildVPlans(VF, VF); 6771 6772 // For VPlan build stress testing, we bail out after VPlan construction. 6773 if (VPlanBuildStressTest) 6774 return VectorizationFactor::Disabled(); 6775 6776 return {VF, 0}; 6777 } 6778 6779 LLVM_DEBUG( 6780 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 6781 "VPlan-native path.\n"); 6782 return VectorizationFactor::Disabled(); 6783 } 6784 6785 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF, 6786 unsigned UserIC) { 6787 assert(OrigLoop->empty() && "Inner loop expected."); 6788 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); 6789 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 6790 return None; 6791 6792 // Invalidate interleave groups if all blocks of loop will be predicated. 6793 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 6794 !useMaskedInterleavedAccesses(*TTI)) { 6795 LLVM_DEBUG( 6796 dbgs() 6797 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 6798 "which requires masked-interleaved support.\n"); 6799 if (CM.InterleaveInfo.invalidateGroups()) 6800 // Invalidating interleave groups also requires invalidating all decisions 6801 // based on them, which includes widening decisions and uniform and scalar 6802 // values. 
6803 CM.invalidateCostModelingDecisions(); 6804 } 6805 6806 if (UserVF) { 6807 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 6808 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); 6809 // Collect the instructions (and their associated costs) that will be more 6810 // profitable to scalarize. 6811 CM.selectUserVectorizationFactor(UserVF); 6812 CM.collectInLoopReductions(); 6813 buildVPlansWithVPRecipes(UserVF, UserVF); 6814 LLVM_DEBUG(printPlans(dbgs())); 6815 return {{UserVF, 0}}; 6816 } 6817 6818 unsigned MaxVF = MaybeMaxVF.getValue(); 6819 assert(MaxVF != 0 && "MaxVF is zero."); 6820 6821 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 6822 // Collect Uniform and Scalar instructions after vectorization with VF. 6823 CM.collectUniformsAndScalars(VF); 6824 6825 // Collect the instructions (and their associated costs) that will be more 6826 // profitable to scalarize. 6827 if (VF > 1) 6828 CM.collectInstsToScalarize(VF); 6829 } 6830 6831 CM.collectInLoopReductions(); 6832 6833 buildVPlansWithVPRecipes(1, MaxVF); 6834 LLVM_DEBUG(printPlans(dbgs())); 6835 if (MaxVF == 1) 6836 return VectorizationFactor::Disabled(); 6837 6838 // Select the optimal vectorization factor. 6839 return CM.selectVectorizationFactor(MaxVF); 6840 } 6841 6842 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { 6843 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 6844 << '\n'); 6845 BestVF = VF; 6846 BestUF = UF; 6847 6848 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 6849 return !Plan->hasVF(VF); 6850 }); 6851 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 6852 } 6853 6854 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 6855 DominatorTree *DT) { 6856 // Perform the actual loop transformation. 6857 6858 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 6859 VPCallbackILV CallbackILV(ILV); 6860 6861 VPTransformState State{BestVF, BestUF, LI, 6862 DT, ILV.Builder, ILV.VectorLoopValueMap, 6863 &ILV, CallbackILV}; 6864 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 6865 State.TripCount = ILV.getOrCreateTripCount(nullptr); 6866 State.CanonicalIV = ILV.Induction; 6867 6868 //===------------------------------------------------===// 6869 // 6870 // Notice: any optimization or new instruction that go 6871 // into the code below should also be implemented in 6872 // the cost-model. 6873 // 6874 //===------------------------------------------------===// 6875 6876 // 2. Copy and widen instructions from the old loop into the new loop. 6877 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 6878 VPlans.front()->execute(&State); 6879 6880 // 3. Fix the vectorized code: take care of header phi's, live-outs, 6881 // predication, updating analyses. 6882 ILV.fixVectorizedLoop(); 6883 } 6884 6885 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 6886 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6887 BasicBlock *Latch = OrigLoop->getLoopLatch(); 6888 6889 // We create new control-flow for the vectorized loop, so the original 6890 // condition will be dead after vectorization if it's only used by the 6891 // branch. 6892 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 6893 if (Cmp && Cmp->hasOneUse()) 6894 DeadInstructions.insert(Cmp); 6895 6896 // We create new "steps" for induction variable updates to which the original 6897 // induction variables map. 
An original update instruction will be dead if 6898 // all its users except the induction variable are dead. 6899 for (auto &Induction : Legal->getInductionVars()) { 6900 PHINode *Ind = Induction.first; 6901 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 6902 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 6903 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 6904 })) 6905 DeadInstructions.insert(IndUpdate); 6906 6907 // We record as "Dead" also the type-casting instructions we had identified 6908 // during induction analysis. We don't need any handling for them in the 6909 // vectorized loop because we have proven that, under a proper runtime 6910 // test guarding the vectorized loop, the value of the phi, and the casted 6911 // value of the phi, are the same. The last instruction in this casting chain 6912 // will get its scalar/vector/widened def from the scalar/vector/widened def 6913 // of the respective phi node. Any other casts in the induction def-use chain 6914 // have no other uses outside the phi update chain, and will be ignored. 6915 InductionDescriptor &IndDes = Induction.second; 6916 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6917 DeadInstructions.insert(Casts.begin(), Casts.end()); 6918 } 6919 } 6920 6921 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6922 6923 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6924 6925 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6926 Instruction::BinaryOps BinOp) { 6927 // When unrolling and the VF is 1, we only need to add a simple scalar. 6928 Type *Ty = Val->getType(); 6929 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6930 6931 if (Ty->isFloatingPointTy()) { 6932 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6933 6934 // Floating point operations had to be 'fast' to enable the unrolling. 6935 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6936 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6937 } 6938 Constant *C = ConstantInt::get(Ty, StartIdx); 6939 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6940 } 6941 6942 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6943 SmallVector<Metadata *, 4> MDs; 6944 // Reserve first location for self reference to the LoopID metadata node. 6945 MDs.push_back(nullptr); 6946 bool IsUnrollMetadata = false; 6947 MDNode *LoopID = L->getLoopID(); 6948 if (LoopID) { 6949 // First find existing loop unrolling disable metadata. 6950 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6951 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6952 if (MD) { 6953 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6954 IsUnrollMetadata = 6955 S && S->getString().startswith("llvm.loop.unroll.disable"); 6956 } 6957 MDs.push_back(LoopID->getOperand(i)); 6958 } 6959 } 6960 6961 if (!IsUnrollMetadata) { 6962 // Add runtime unroll disable metadata. 6963 LLVMContext &Context = L->getHeader()->getContext(); 6964 SmallVector<Metadata *, 1> DisableOperands; 6965 DisableOperands.push_back( 6966 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6967 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6968 MDs.push_back(DisableNode); 6969 MDNode *NewLoopID = MDNode::get(Context, MDs); 6970 // Set operand 0 to refer to the loop id itself. 
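    // The result is a self-referential loop ID; in textual IR it looks roughly
    // like this (metadata numbers are illustrative):
    //   !0 = distinct !{!0, !1}
    //   !1 = !{!"llvm.loop.unroll.runtime.disable"}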
6971 NewLoopID->replaceOperandWith(0, NewLoopID); 6972 L->setLoopID(NewLoopID); 6973 } 6974 } 6975 6976 bool LoopVectorizationPlanner::getDecisionAndClampRange( 6977 const std::function<bool(unsigned)> &Predicate, VFRange &Range) { 6978 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 6979 bool PredicateAtRangeStart = Predicate(Range.Start); 6980 6981 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 6982 if (Predicate(TmpVF) != PredicateAtRangeStart) { 6983 Range.End = TmpVF; 6984 break; 6985 } 6986 6987 return PredicateAtRangeStart; 6988 } 6989 6990 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 6991 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 6992 /// of VF's starting at a given VF and extending it as much as possible. Each 6993 /// vectorization decision can potentially shorten this sub-range during 6994 /// buildVPlan(). 6995 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 6996 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6997 VFRange SubRange = {VF, MaxVF + 1}; 6998 VPlans.push_back(buildVPlan(SubRange)); 6999 VF = SubRange.End; 7000 } 7001 } 7002 7003 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 7004 VPlanPtr &Plan) { 7005 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7006 7007 // Look for cached value. 7008 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 7009 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 7010 if (ECEntryIt != EdgeMaskCache.end()) 7011 return ECEntryIt->second; 7012 7013 VPValue *SrcMask = createBlockInMask(Src, Plan); 7014 7015 // The terminator has to be a branch inst! 7016 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 7017 assert(BI && "Unexpected terminator found"); 7018 7019 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 7020 return EdgeMaskCache[Edge] = SrcMask; 7021 7022 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 7023 assert(EdgeMask && "No Edge Mask found for condition"); 7024 7025 if (BI->getSuccessor(0) != Dst) 7026 EdgeMask = Builder.createNot(EdgeMask); 7027 7028 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 7029 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 7030 7031 return EdgeMaskCache[Edge] = EdgeMask; 7032 } 7033 7034 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 7035 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 7036 7037 // Look for cached value. 7038 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 7039 if (BCEntryIt != BlockMaskCache.end()) 7040 return BCEntryIt->second; 7041 7042 // All-one mask is modelled as no-mask following the convention for masked 7043 // load/store/gather/scatter. Initialize BlockMask to no-mask. 7044 VPValue *BlockMask = nullptr; 7045 7046 if (OrigLoop->getHeader() == BB) { 7047 if (!CM.blockNeedsPredication(BB)) 7048 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 7049 7050 // Introduce the early-exit compare IV <= BTC to form header block mask. 7051 // This is used instead of IV < TC because TC may wrap, unlike BTC. 7052 // Start by constructing the desired canonical IV. 
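    // Conceptually, for a loop with trip count TC (so BTC = TC - 1) and VF = 4,
    // each part of the mask is the lane-wise comparison
    //   <iv, iv+1, iv+2, iv+3> <=u <BTC, BTC, BTC, BTC>
    // so only lanes that correspond to real iterations stay active.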
7053 VPValue *IV = nullptr; 7054 if (Legal->getPrimaryInduction()) 7055 IV = Plan->getVPValue(Legal->getPrimaryInduction()); 7056 else { 7057 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 7058 Builder.getInsertBlock()->appendRecipe(IVRecipe); 7059 IV = IVRecipe->getVPValue(); 7060 } 7061 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 7062 bool TailFolded = !CM.isScalarEpilogueAllowed(); 7063 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) 7064 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC}); 7065 else 7066 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 7067 return BlockMaskCache[BB] = BlockMask; 7068 } 7069 7070 // This is the block mask. We OR all incoming edges. 7071 for (auto *Predecessor : predecessors(BB)) { 7072 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 7073 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 7074 return BlockMaskCache[BB] = EdgeMask; 7075 7076 if (!BlockMask) { // BlockMask has its initialized nullptr value. 7077 BlockMask = EdgeMask; 7078 continue; 7079 } 7080 7081 BlockMask = Builder.createOr(BlockMask, EdgeMask); 7082 } 7083 7084 return BlockMaskCache[BB] = BlockMask; 7085 } 7086 7087 VPWidenMemoryInstructionRecipe * 7088 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 7089 VPlanPtr &Plan) { 7090 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7091 "Must be called with either a load or store"); 7092 7093 auto willWiden = [&](unsigned VF) -> bool { 7094 if (VF == 1) 7095 return false; 7096 LoopVectorizationCostModel::InstWidening Decision = 7097 CM.getWideningDecision(I, VF); 7098 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 7099 "CM decision should be taken at this point."); 7100 if (Decision == LoopVectorizationCostModel::CM_Interleave) 7101 return true; 7102 if (CM.isScalarAfterVectorization(I, VF) || 7103 CM.isProfitableToScalarize(I, VF)) 7104 return false; 7105 return Decision != LoopVectorizationCostModel::CM_Scalarize; 7106 }; 7107 7108 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7109 return nullptr; 7110 7111 VPValue *Mask = nullptr; 7112 if (Legal->isMaskRequired(I)) 7113 Mask = createBlockInMask(I->getParent(), Plan); 7114 7115 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 7116 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 7117 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 7118 7119 StoreInst *Store = cast<StoreInst>(I); 7120 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 7121 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 7122 } 7123 7124 VPWidenIntOrFpInductionRecipe * 7125 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { 7126 // Check if this is an integer or fp induction. If so, build the recipe that 7127 // produces its scalar and vector values. 7128 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 7129 if (II.getKind() == InductionDescriptor::IK_IntInduction || 7130 II.getKind() == InductionDescriptor::IK_FpInduction) 7131 return new VPWidenIntOrFpInductionRecipe(Phi); 7132 7133 return nullptr; 7134 } 7135 7136 VPWidenIntOrFpInductionRecipe * 7137 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, 7138 VFRange &Range) const { 7139 // Optimize the special case where the source is a constant integer 7140 // induction variable. 
Notice that we can only optimize the 'trunc' case 7141 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 7142 // (c) other casts depend on pointer size. 7143 7144 // Determine whether \p K is a truncation based on an induction variable that 7145 // can be optimized. 7146 auto isOptimizableIVTruncate = 7147 [&](Instruction *K) -> std::function<bool(unsigned)> { 7148 return 7149 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; 7150 }; 7151 7152 if (LoopVectorizationPlanner::getDecisionAndClampRange( 7153 isOptimizableIVTruncate(I), Range)) 7154 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 7155 I); 7156 return nullptr; 7157 } 7158 7159 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 7160 // We know that all PHIs in non-header blocks are converted into selects, so 7161 // we don't have to worry about the insertion order and we can just use the 7162 // builder. At this point we generate the predication tree. There may be 7163 // duplications since this is a simple recursive scan, but future 7164 // optimizations will clean it up. 7165 7166 SmallVector<VPValue *, 2> Operands; 7167 unsigned NumIncoming = Phi->getNumIncomingValues(); 7168 for (unsigned In = 0; In < NumIncoming; In++) { 7169 VPValue *EdgeMask = 7170 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 7171 assert((EdgeMask || NumIncoming == 1) && 7172 "Multiple predecessors with one having a full mask"); 7173 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 7174 if (EdgeMask) 7175 Operands.push_back(EdgeMask); 7176 } 7177 return new VPBlendRecipe(Phi, Operands); 7178 } 7179 7180 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 7181 VPlan &Plan) const { 7182 7183 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7184 [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); }, 7185 Range); 7186 7187 if (IsPredicated) 7188 return nullptr; 7189 7190 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7191 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 7192 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 7193 return nullptr; 7194 7195 auto willWiden = [&](unsigned VF) -> bool { 7196 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7197 // The following case may be scalarized depending on the VF. 7198 // The flag shows whether we use Intrinsic or a usual Call for vectorized 7199 // version of the instruction. 7200 // Is it beneficial to perform intrinsic call compared to lib call? 7201 bool NeedToScalarize = false; 7202 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 7203 bool UseVectorIntrinsic = 7204 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 7205 return UseVectorIntrinsic || !NeedToScalarize; 7206 }; 7207 7208 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7209 return nullptr; 7210 7211 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 7212 } 7213 7214 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 7215 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 7216 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 7217 // Instruction should be widened, unless it is scalar after vectorization, 7218 // scalarization is profitable or it is predicated. 
7219 auto WillScalarize = [this, I](unsigned VF) -> bool { 7220 return CM.isScalarAfterVectorization(I, VF) || 7221 CM.isProfitableToScalarize(I, VF) || 7222 CM.isScalarWithPredication(I, VF); 7223 }; 7224 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 7225 Range); 7226 } 7227 7228 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 7229 auto IsVectorizableOpcode = [](unsigned Opcode) { 7230 switch (Opcode) { 7231 case Instruction::Add: 7232 case Instruction::And: 7233 case Instruction::AShr: 7234 case Instruction::BitCast: 7235 case Instruction::FAdd: 7236 case Instruction::FCmp: 7237 case Instruction::FDiv: 7238 case Instruction::FMul: 7239 case Instruction::FNeg: 7240 case Instruction::FPExt: 7241 case Instruction::FPToSI: 7242 case Instruction::FPToUI: 7243 case Instruction::FPTrunc: 7244 case Instruction::FRem: 7245 case Instruction::FSub: 7246 case Instruction::ICmp: 7247 case Instruction::IntToPtr: 7248 case Instruction::LShr: 7249 case Instruction::Mul: 7250 case Instruction::Or: 7251 case Instruction::PtrToInt: 7252 case Instruction::SDiv: 7253 case Instruction::Select: 7254 case Instruction::SExt: 7255 case Instruction::Shl: 7256 case Instruction::SIToFP: 7257 case Instruction::SRem: 7258 case Instruction::Sub: 7259 case Instruction::Trunc: 7260 case Instruction::UDiv: 7261 case Instruction::UIToFP: 7262 case Instruction::URem: 7263 case Instruction::Xor: 7264 case Instruction::ZExt: 7265 return true; 7266 } 7267 return false; 7268 }; 7269 7270 if (!IsVectorizableOpcode(I->getOpcode())) 7271 return nullptr; 7272 7273 // Success: widen this instruction. 7274 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 7275 } 7276 7277 VPBasicBlock *VPRecipeBuilder::handleReplication( 7278 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 7279 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 7280 VPlanPtr &Plan) { 7281 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 7282 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, 7283 Range); 7284 7285 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7286 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 7287 7288 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 7289 IsUniform, IsPredicated); 7290 setRecipe(I, Recipe); 7291 7292 // Find if I uses a predicated instruction. If so, it will use its scalar 7293 // value. Avoid hoisting the insert-element which packs the scalar value into 7294 // a vector value, as that happens iff all users use the vector value. 7295 for (auto &Op : I->operands()) 7296 if (auto *PredInst = dyn_cast<Instruction>(Op)) 7297 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 7298 PredInst2Recipe[PredInst]->setAlsoPack(false); 7299 7300 // Finalize the recipe for Instr, first if it is not predicated. 7301 if (!IsPredicated) { 7302 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 7303 VPBB->appendRecipe(Recipe); 7304 return VPBB; 7305 } 7306 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7307 assert(VPBB->getSuccessors().empty() && 7308 "VPBB has successors when handling predicated replication."); 7309 // Record predicated instructions for above packing optimizations. 
7310 PredInst2Recipe[I] = Recipe; 7311 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7312 VPBlockUtils::insertBlockAfter(Region, VPBB); 7313 auto *RegSucc = new VPBasicBlock(); 7314 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7315 return RegSucc; 7316 } 7317 7318 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7319 VPRecipeBase *PredRecipe, 7320 VPlanPtr &Plan) { 7321 // Instructions marked for predication are replicated and placed under an 7322 // if-then construct to prevent side-effects. 7323 7324 // Generate recipes to compute the block mask for this region. 7325 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 7326 7327 // Build the triangular if-then region. 7328 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 7329 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 7330 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 7331 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 7332 auto *PHIRecipe = 7333 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 7334 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 7335 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 7336 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 7337 7338 // Note: first set Entry as region entry and then connect successors starting 7339 // from it in order, to propagate the "parent" of each VPBasicBlock. 7340 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 7341 VPBlockUtils::connectBlocks(Pred, Exit); 7342 7343 return Region; 7344 } 7345 7346 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 7347 VFRange &Range, 7348 VPlanPtr &Plan) { 7349 // First, check for specific widening recipes that deal with calls, memory 7350 // operations, inductions and Phi nodes. 7351 if (auto *CI = dyn_cast<CallInst>(Instr)) 7352 return tryToWidenCall(CI, Range, *Plan); 7353 7354 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 7355 return tryToWidenMemory(Instr, Range, Plan); 7356 7357 VPRecipeBase *Recipe; 7358 if (auto Phi = dyn_cast<PHINode>(Instr)) { 7359 if (Phi->getParent() != OrigLoop->getHeader()) 7360 return tryToBlend(Phi, Plan); 7361 if ((Recipe = tryToOptimizeInductionPHI(Phi))) 7362 return Recipe; 7363 return new VPWidenPHIRecipe(Phi); 7364 } 7365 7366 if (isa<TruncInst>(Instr) && 7367 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range))) 7368 return Recipe; 7369 7370 if (!shouldWiden(Instr, Range)) 7371 return nullptr; 7372 7373 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 7374 return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()), 7375 OrigLoop); 7376 7377 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 7378 bool InvariantCond = 7379 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 7380 return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()), 7381 InvariantCond); 7382 } 7383 7384 return tryToWiden(Instr, *Plan); 7385 } 7386 7387 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 7388 unsigned MaxVF) { 7389 assert(OrigLoop->empty() && "Inner loop expected."); 7390 7391 // Collect conditions feeding internal conditional branches; they need to be 7392 // represented in VPlan for it to model masking. 
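  // For example, for a loop body containing
  //   if (a[i] > 42)
  //     b[i] = 0;
  // the compare feeding the conditional branch needs a VPValue of its own,
  // because the edge and block masks built by VPRecipeBuilder are expressed in
  // terms of that condition.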
  SmallPtrSet<Value *, 1> NeedDef;

  auto *Latch = OrigLoop->getLoopLatch();
  for (BasicBlock *BB : OrigLoop->blocks()) {
    if (BB == Latch)
      continue;
    BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
    if (Branch && Branch->isConditional())
      NeedDef.insert(Branch->getCondition());
  }

  // If the tail is to be folded by masking, the primary induction variable, if
  // it exists, needs to be represented in VPlan for it to model early-exit
  // masking. Also, both the Phi and the live-out instruction of each reduction
  // are required in order to introduce a select between them in VPlan.
  if (CM.foldTailByMasking()) {
    if (Legal->getPrimaryInduction())
      NeedDef.insert(Legal->getPrimaryInduction());
    for (auto &Reduction : Legal->getReductionVars()) {
      NeedDef.insert(Reduction.first);
      NeedDef.insert(Reduction.second.getLoopExitInstr());
    }
  }

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
                                             DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
    SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const DenseMap<Instruction *, Instruction *> &SinkAfter) {

  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of a vector value.
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
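  // (SinkAfter pairs come from legality analysis, e.g. for first-order
  // recurrences where a user of the recurrence phi has to be moved after the
  // instruction producing the previous value; the moveAfter() step further
  // down relies on both sides having a recorded recipe.)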
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor::RecurrenceKind Kind =
        Legal->getReductionVars()[Phi].getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
          Kind == RecurrenceDescriptor::RK_FloatMinMax) {
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
      }
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto applyIG = [IG, this](unsigned VF) -> bool {
      return (VF >= 2 && // Query is illegal for VF == 1
              CM.getWideningDecision(IG->getInsertPos(), VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
    if (!getDecisionAndClampRange(applyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
    for (unsigned i = 0; i < IG->getFactor(); i++)
      if (Instruction *Member = IG->getMember(i))
        RecipeBuilder.recordRecipeOf(Member);
  }

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
  auto Plan = std::make_unique<VPlan>();
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  Plan->setEntry(VPBB);

  // Represent values that will have defs inside VPlan.
  for (Value *V : NeedDef)
    Plan->addVPValue(V);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
7542 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 7543 continue; 7544 7545 if (auto Recipe = 7546 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { 7547 RecipeBuilder.setRecipe(Instr, Recipe); 7548 VPBB->appendRecipe(Recipe); 7549 continue; 7550 } 7551 7552 // Otherwise, if all widening options failed, Instruction is to be 7553 // replicated. This may create a successor for VPBB. 7554 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7555 Instr, Range, VPBB, PredInst2Recipe, Plan); 7556 if (NextVPBB != VPBB) { 7557 VPBB = NextVPBB; 7558 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 7559 : ""); 7560 } 7561 } 7562 } 7563 7564 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7565 // may also be empty, such as the last one VPBB, reflecting original 7566 // basic-blocks with no recipes. 7567 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7568 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7569 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7570 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7571 delete PreEntry; 7572 7573 // --------------------------------------------------------------------------- 7574 // Transform initial VPlan: Apply previously taken decisions, in order, to 7575 // bring the VPlan to its final state. 7576 // --------------------------------------------------------------------------- 7577 7578 // Apply Sink-After legal constraints. 7579 for (auto &Entry : SinkAfter) { 7580 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7581 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7582 Sink->moveAfter(Target); 7583 } 7584 7585 // Interleave memory: for each Interleave Group we marked earlier as relevant 7586 // for this VPlan, replace the Recipes widening its memory instructions with a 7587 // single VPInterleaveRecipe at its insertion point. 7588 for (auto IG : InterleaveGroups) { 7589 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7590 RecipeBuilder.getRecipe(IG->getInsertPos())); 7591 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) 7592 ->insertBefore(Recipe); 7593 7594 for (unsigned i = 0; i < IG->getFactor(); ++i) 7595 if (Instruction *Member = IG->getMember(i)) { 7596 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7597 } 7598 } 7599 7600 // Adjust the recipes for any inloop reductions. 7601 if (Range.Start > 1) 7602 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 7603 7604 // Finally, if tail is folded by masking, introduce selects between the phi 7605 // and the live-out instruction of each reduction, at the end of the latch. 
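  // For example, for a sum reduction whose loop contains
  //   %red.next = add %red.phi, %x
  // this emits, at the end of the latch,
  //   select(header-mask, %red.next, %red.phi)
  // so lanes that belong to the folded tail keep the previous value instead of
  // consuming values from masked-off loads. (Illustrative IR names.)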
  if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
    Builder.setInsertPoint(VPBB);
    auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
    for (auto &Reduction : Legal->getReductionVars()) {
      assert(!CM.isInLoopReduction(Reduction.first) &&
             "Didn't expect inloop tail folded reduction yet!");
      VPValue *Phi = Plan->getVPValue(Reduction.first);
      VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
      Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
    }
  }

  std::string PlanName;
  raw_string_ostream RSO(PlanName);
  unsigned VF = Range.Start;
  Plan->addVF(VF);
  RSO << "Initial VPlan for VF={" << VF;
  for (VF *= 2; VF < Range.End; VF *= 2) {
    Plan->addVF(VF);
    RSO << "," << VF;
  }
  RSO << "},UF>=1";
  RSO.flush();
  Plan->setName(PlanName);

  return Plan;
}

VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build VPlan
  // upfront in the vectorization pipeline.
  assert(!OrigLoop->empty());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanTransforms::VPInstructionsToVPRecipes(
      OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
  return Plan;
}

// Adjust the recipes for any inloop reductions. The chain of instructions
// leading from the loop exit instr to the phi needs to be converted to
// reductions, with one operand being vector and the other being the scalar
// reduction chain.
void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
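    // For example, given (illustrative names)
    //   %phi = phi ...  ;  %t = add %phi, %a  ;  %s = add %t, %b
    // ReductionOperations is {%t, %s}. When visiting %t, Chain is %phi, so the
    // VPValue of %phi becomes ChainOp and that of %a becomes VecOp; Chain then
    // advances to %t before %s is visited.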
7682 Instruction *Chain = Phi; 7683 for (Instruction *R : ReductionOperations) { 7684 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 7685 RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind(); 7686 7687 VPValue *ChainOp = Plan->getVPValue(Chain); 7688 unsigned FirstOpId; 7689 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7690 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7691 assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSelectSC && 7692 "Expected to replace a VPWidenSelectSC"); 7693 FirstOpId = 1; 7694 } else { 7695 assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC && 7696 "Expected to replace a VPWidenSC"); 7697 FirstOpId = 0; 7698 } 7699 unsigned VecOpId = 7700 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 7701 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 7702 7703 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 7704 &RdxDesc, R, ChainOp, VecOp, Legal->hasFunNoNaNAttr(), TTI); 7705 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 7706 WidenRecipe->eraseFromParent(); 7707 7708 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7709 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7710 VPRecipeBase *CompareRecipe = 7711 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 7712 assert(CompareRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC && 7713 "Expected to replace a VPWidenSC"); 7714 CompareRecipe->eraseFromParent(); 7715 } 7716 Chain = R; 7717 } 7718 } 7719 } 7720 7721 Value* LoopVectorizationPlanner::VPCallbackILV:: 7722 getOrCreateVectorValues(Value *V, unsigned Part) { 7723 return ILV.getOrCreateVectorValue(V, Part); 7724 } 7725 7726 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 7727 Value *V, const VPIteration &Instance) { 7728 return ILV.getOrCreateScalarValue(V, Instance); 7729 } 7730 7731 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 7732 VPSlotTracker &SlotTracker) const { 7733 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7734 IG->getInsertPos()->printAsOperand(O, false); 7735 O << ", "; 7736 getAddr()->printAsOperand(O, SlotTracker); 7737 VPValue *Mask = getMask(); 7738 if (Mask) { 7739 O << ", "; 7740 Mask->printAsOperand(O, SlotTracker); 7741 } 7742 for (unsigned i = 0; i < IG->getFactor(); ++i) 7743 if (Instruction *I = IG->getMember(i)) 7744 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 7745 } 7746 7747 void VPWidenCallRecipe::execute(VPTransformState &State) { 7748 State.ILV->widenCallInstruction(Ingredient, User, State); 7749 } 7750 7751 void VPWidenSelectRecipe::execute(VPTransformState &State) { 7752 State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State); 7753 } 7754 7755 void VPWidenRecipe::execute(VPTransformState &State) { 7756 State.ILV->widenInstruction(Ingredient, User, State); 7757 } 7758 7759 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7760 State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant, 7761 IsIndexLoopInvariant, State); 7762 } 7763 7764 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7765 assert(!State.Instance && "Int or FP induction being replicated."); 7766 State.ILV->widenIntOrFpInduction(IV, Trunc); 7767 } 7768 7769 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7770 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7771 } 7772 7773 void VPBlendRecipe::execute(VPTransformState &State) { 7774 
State.ILV->setDebugLocFromInst(State.Builder, Phi); 7775 // We know that all PHIs in non-header blocks are converted into 7776 // selects, so we don't have to worry about the insertion order and we 7777 // can just use the builder. 7778 // At this point we generate the predication tree. There may be 7779 // duplications since this is a simple recursive scan, but future 7780 // optimizations will clean it up. 7781 7782 unsigned NumIncoming = getNumIncomingValues(); 7783 7784 // Generate a sequence of selects of the form: 7785 // SELECT(Mask3, In3, 7786 // SELECT(Mask2, In2, 7787 // SELECT(Mask1, In1, 7788 // In0))) 7789 // Note that Mask0 is never used: lanes for which no path reaches this phi and 7790 // are essentially undef are taken from In0. 7791 InnerLoopVectorizer::VectorParts Entry(State.UF); 7792 for (unsigned In = 0; In < NumIncoming; ++In) { 7793 for (unsigned Part = 0; Part < State.UF; ++Part) { 7794 // We might have single edge PHIs (blocks) - use an identity 7795 // 'select' for the first PHI operand. 7796 Value *In0 = State.get(getIncomingValue(In), Part); 7797 if (In == 0) 7798 Entry[Part] = In0; // Initialize with the first incoming value. 7799 else { 7800 // Select between the current value and the previous incoming edge 7801 // based on the incoming mask. 7802 Value *Cond = State.get(getMask(In), Part); 7803 Entry[Part] = 7804 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 7805 } 7806 } 7807 } 7808 for (unsigned Part = 0; Part < State.UF; ++Part) 7809 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 7810 } 7811 7812 void VPInterleaveRecipe::execute(VPTransformState &State) { 7813 assert(!State.Instance && "Interleave group being replicated."); 7814 State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask()); 7815 } 7816 7817 void VPReductionRecipe::execute(VPTransformState &State) { 7818 assert(!State.Instance && "Reduction being replicated."); 7819 for (unsigned Part = 0; Part < State.UF; ++Part) { 7820 unsigned Kind = RdxDesc->getRecurrenceKind(); 7821 Value *NewVecOp = State.get(VecOp, Part); 7822 Value *NewRed = 7823 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN); 7824 Value *PrevInChain = State.get(ChainOp, Part); 7825 Value *NextInChain; 7826 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7827 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7828 NextInChain = 7829 createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(), 7830 NewRed, PrevInChain); 7831 } else { 7832 NextInChain = State.Builder.CreateBinOp( 7833 (Instruction::BinaryOps)I->getOpcode(), NewRed, PrevInChain); 7834 } 7835 State.ValueMap.setVectorValue(I, Part, NextInChain); 7836 } 7837 } 7838 7839 void VPReplicateRecipe::execute(VPTransformState &State) { 7840 if (State.Instance) { // Generate a single instance. 7841 State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance, 7842 IsPredicated, State); 7843 // Insert scalar instance packing it into a vector. 7844 if (AlsoPack && State.VF > 1) { 7845 // If we're constructing lane 0, initialize to start from undef. 
      if (State.Instance->Lane == 0) {
        Value *Undef = UndefValue::get(
            FixedVectorType::get(Ingredient->getType(), State.VF));
        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF;
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane},
                                      IsPredicated, State);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst = cast<Instruction>(
      State.ValueMap.getScalarValue(PredInst, *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.ValueMap.hasVectorValue(PredInst, Part)) {
    Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
  } else {
    Type *PredInstType = PredInst->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
  }
}

void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
  VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
  State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
                                        getMask());
}

// Determine how to lower the scalar epilogue, which depends on 1) optimising
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyses whether the loop is suitable
// for predication.
static ScalarEpilogueLowering getScalarEpilogueLowering(
    Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
    BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
    AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
    LoopVectorizationLegality &LVL) {
  // 1) OptSize takes precedence over all other options, i.e. if this is set,
  // don't look at hints or options, and don't request a scalar epilogue.
  // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
  // LoopAccessInfo (due to code dependency and not being able to reliably get
  // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
  // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
  // versioning when the vectorization is forced, unlike hasOptSize. So revert
  // back to the old way and vectorize with versioning when forced. See D81345.)
  if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
                                                      PGSOQueryType::IRPass) &&
                          Hints.getForce() != LoopVectorizeHints::FK_Enabled))
    return CM_ScalarEpilogueNotAllowedOptSize;

  bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
                              !PreferPredicateOverEpilog;

  // 2) Next, if disabling predication is requested on the command line, honour
  // this and request a scalar epilogue.
  if (PredicateOptDisabled)
    return CM_ScalarEpilogueAllowed;

  // 3) and 4) look if enabling predication is requested on the command line,
  // with a loop hint, or if the TTI hook indicates this is profitable, request
  // predication.
  if (PreferPredicateOverEpilog ||
      Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
      (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
                                        LVL.getLAI()) &&
       Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
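// (This path only runs for outer loops, typically ones explicitly marked for
// vectorization, and only when the VPlan-native path is enabled via the
// -enable-vplan-native-path option; illustrative source-level shape:
//   #pragma clang loop vectorize(enable)
//   for (i = 0; i < n; ++i)      // outer loop handled here
//     for (j = 0; j < m; ++j)
//       a[i][j] += b[i][j];
// Innermost loops continue to go through the regular inner-loop pipeline.)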
7976 static bool processLoopInVPlanNativePath( 7977 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 7978 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 7979 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 7980 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 7981 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 7982 7983 if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) { 7984 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 7985 return false; 7986 } 7987 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 7988 Function *F = L->getHeader()->getParent(); 7989 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 7990 7991 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7992 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 7993 7994 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 7995 &Hints, IAI); 7996 // Use the planner for outer loop vectorization. 7997 // TODO: CM is not used at this point inside the planner. Turn CM into an 7998 // optional argument if we don't need it in the future. 7999 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 8000 8001 // Get user vectorization factor. 8002 const unsigned UserVF = Hints.getWidth(); 8003 8004 // Plan how to best vectorize, return the best VF and its cost. 8005 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 8006 8007 // If we are stress testing VPlan builds, do not attempt to generate vector 8008 // code. Masked vector code generation support will follow soon. 8009 // Also, do not attempt to vectorize if no vector code will be produced. 8010 if (VPlanBuildStressTest || EnableVPlanPredication || 8011 VectorizationFactor::Disabled() == VF) 8012 return false; 8013 8014 LVP.setBestPlan(VF.Width, 1); 8015 8016 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 8017 &CM, BFI, PSI); 8018 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 8019 << L->getHeader()->getParent()->getName() << "\"\n"); 8020 LVP.executePlan(LB, DT); 8021 8022 // Mark the loop as already vectorized to avoid vectorizing again. 8023 Hints.setAlreadyVectorized(); 8024 8025 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 8026 return true; 8027 } 8028 8029 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 8030 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 8031 !EnableLoopInterleaving), 8032 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 8033 !EnableLoopVectorization) {} 8034 8035 bool LoopVectorizePass::processLoop(Loop *L) { 8036 assert((EnableVPlanNativePath || L->empty()) && 8037 "VPlan-native path is not enabled. Only process inner loops."); 8038 8039 #ifndef NDEBUG 8040 const std::string DebugLocStr = getDebugLocString(L); 8041 #endif /* NDEBUG */ 8042 8043 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 8044 << L->getHeader()->getParent()->getName() << "\" from " 8045 << DebugLocStr << "\n"); 8046 8047 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 8048 8049 LLVM_DEBUG( 8050 dbgs() << "LV: Loop hints:" 8051 << " force=" 8052 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 8053 ? "disabled" 8054 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 8055 ? 
"enabled" 8056 : "?")) 8057 << " width=" << Hints.getWidth() 8058 << " unroll=" << Hints.getInterleave() << "\n"); 8059 8060 // Function containing loop 8061 Function *F = L->getHeader()->getParent(); 8062 8063 // Looking at the diagnostic output is the only way to determine if a loop 8064 // was vectorized (other than looking at the IR or machine code), so it 8065 // is important to generate an optimization remark for each loop. Most of 8066 // these messages are generated as OptimizationRemarkAnalysis. Remarks 8067 // generated as OptimizationRemark and OptimizationRemarkMissed are 8068 // less verbose reporting vectorized loops and unvectorized loops that may 8069 // benefit from vectorization, respectively. 8070 8071 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 8072 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 8073 return false; 8074 } 8075 8076 PredicatedScalarEvolution PSE(*SE, *L); 8077 8078 // Check if it is legal to vectorize the loop. 8079 LoopVectorizationRequirements Requirements(*ORE); 8080 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 8081 &Requirements, &Hints, DB, AC, BFI, PSI); 8082 if (!LVL.canVectorize(EnableVPlanNativePath)) { 8083 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 8084 Hints.emitRemarkWithHints(); 8085 return false; 8086 } 8087 8088 // Check the function attributes and profiles to find out if this function 8089 // should be optimized for size. 8090 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 8091 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 8092 8093 // Entrance to the VPlan-native vectorization path. Outer loops are processed 8094 // here. They may require CFG and instruction level transformations before 8095 // even evaluating whether vectorization is profitable. Since we cannot modify 8096 // the incoming IR, we need to build VPlan upfront in the vectorization 8097 // pipeline. 8098 if (!L->empty()) 8099 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 8100 ORE, BFI, PSI, Hints); 8101 8102 assert(L->empty() && "Inner loop expected."); 8103 8104 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 8105 // count by optimizing for size, to minimize overheads. 8106 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 8107 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 8108 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 8109 << "This loop is worth vectorizing only if no scalar " 8110 << "iteration overheads are incurred."); 8111 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 8112 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 8113 else { 8114 LLVM_DEBUG(dbgs() << "\n"); 8115 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 8116 } 8117 } 8118 8119 // Check the function attributes to see if implicit floats are allowed. 8120 // FIXME: This check doesn't seem possibly correct -- what if the loop is 8121 // an integer loop and the vector instructions selected are purely integer 8122 // vector instructions? 8123 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 8124 reportVectorizationFailure( 8125 "Can't vectorize when the NoImplicitFloat attribute is used", 8126 "loop not vectorized due to NoImplicitFloat attribute", 8127 "NoImplicitFloat", ORE, L); 8128 Hints.emitRemarkWithHints(); 8129 return false; 8130 } 8131 8132 // Check if the target supports potentially unsafe FP vectorization. 
8133 // FIXME: Add a check for the type of safety issue (denormal, signaling) 8134 // for the target we're vectorizing for, to make sure none of the 8135 // additional fp-math flags can help. 8136 if (Hints.isPotentiallyUnsafe() && 8137 TTI->isFPVectorizationPotentiallyUnsafe()) { 8138 reportVectorizationFailure( 8139 "Potentially unsafe FP op prevents vectorization", 8140 "loop not vectorized due to unsafe FP support.", 8141 "UnsafeFP", ORE, L); 8142 Hints.emitRemarkWithHints(); 8143 return false; 8144 } 8145 8146 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 8147 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 8148 8149 // If an override option has been passed in for interleaved accesses, use it. 8150 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 8151 UseInterleaved = EnableInterleavedMemAccesses; 8152 8153 // Analyze interleaved memory accesses. 8154 if (UseInterleaved) { 8155 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 8156 } 8157 8158 // Use the cost model. 8159 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 8160 F, &Hints, IAI); 8161 CM.collectValuesToIgnore(); 8162 8163 // Use the planner for vectorization. 8164 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); 8165 8166 // Get user vectorization factor and interleave count. 8167 unsigned UserVF = Hints.getWidth(); 8168 unsigned UserIC = Hints.getInterleave(); 8169 8170 // Plan how to best vectorize, return the best VF and its cost. 8171 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 8172 8173 VectorizationFactor VF = VectorizationFactor::Disabled(); 8174 unsigned IC = 1; 8175 8176 if (MaybeVF) { 8177 VF = *MaybeVF; 8178 // Select the interleave count. 8179 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 8180 } 8181 8182 // Identify the diagnostic messages that should be produced. 8183 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 8184 bool VectorizeLoop = true, InterleaveLoop = true; 8185 if (Requirements.doesNotMeet(F, L, Hints)) { 8186 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " 8187 "requirements.\n"); 8188 Hints.emitRemarkWithHints(); 8189 return false; 8190 } 8191 8192 if (VF.Width == 1) { 8193 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 8194 VecDiagMsg = std::make_pair( 8195 "VectorizationNotBeneficial", 8196 "the cost-model indicates that vectorization is not beneficial"); 8197 VectorizeLoop = false; 8198 } 8199 8200 if (!MaybeVF && UserIC > 1) { 8201 // Tell the user interleaving was avoided up-front, despite being explicitly 8202 // requested. 8203 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 8204 "interleaving should be avoided up front\n"); 8205 IntDiagMsg = std::make_pair( 8206 "InterleavingAvoided", 8207 "Ignoring UserIC, because interleaving was avoided up front"); 8208 InterleaveLoop = false; 8209 } else if (IC == 1 && UserIC <= 1) { 8210 // Tell the user interleaving is not beneficial. 
8211 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 8212 IntDiagMsg = std::make_pair( 8213 "InterleavingNotBeneficial", 8214 "the cost-model indicates that interleaving is not beneficial"); 8215 InterleaveLoop = false; 8216 if (UserIC == 1) { 8217 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; 8218 IntDiagMsg.second += 8219 " and is explicitly disabled or interleave count is set to 1"; 8220 } 8221 } else if (IC > 1 && UserIC == 1) { 8222 // Tell the user interleaving is beneficial, but it explicitly disabled. 8223 LLVM_DEBUG( 8224 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); 8225 IntDiagMsg = std::make_pair( 8226 "InterleavingBeneficialButDisabled", 8227 "the cost-model indicates that interleaving is beneficial " 8228 "but is explicitly disabled or interleave count is set to 1"); 8229 InterleaveLoop = false; 8230 } 8231 8232 // Override IC if user provided an interleave count. 8233 IC = UserIC > 0 ? UserIC : IC; 8234 8235 // Emit diagnostic messages, if any. 8236 const char *VAPassName = Hints.vectorizeAnalysisPassName(); 8237 if (!VectorizeLoop && !InterleaveLoop) { 8238 // Do not vectorize or interleaving the loop. 8239 ORE->emit([&]() { 8240 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 8241 L->getStartLoc(), L->getHeader()) 8242 << VecDiagMsg.second; 8243 }); 8244 ORE->emit([&]() { 8245 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 8246 L->getStartLoc(), L->getHeader()) 8247 << IntDiagMsg.second; 8248 }); 8249 return false; 8250 } else if (!VectorizeLoop && InterleaveLoop) { 8251 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 8252 ORE->emit([&]() { 8253 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 8254 L->getStartLoc(), L->getHeader()) 8255 << VecDiagMsg.second; 8256 }); 8257 } else if (VectorizeLoop && !InterleaveLoop) { 8258 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 8259 << ") in " << DebugLocStr << '\n'); 8260 ORE->emit([&]() { 8261 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 8262 L->getStartLoc(), L->getHeader()) 8263 << IntDiagMsg.second; 8264 }); 8265 } else if (VectorizeLoop && InterleaveLoop) { 8266 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 8267 << ") in " << DebugLocStr << '\n'); 8268 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 8269 } 8270 8271 LVP.setBestPlan(VF.Width, IC); 8272 8273 using namespace ore; 8274 bool DisableRuntimeUnroll = false; 8275 MDNode *OrigLoopID = L->getLoopID(); 8276 8277 if (!VectorizeLoop) { 8278 assert(IC > 1 && "interleave count should not be 1 or 0"); 8279 // If we decided that it is not legal to vectorize the loop, then 8280 // interleave it. 8281 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, 8282 BFI, PSI); 8283 LVP.executePlan(Unroller, DT); 8284 8285 ORE->emit([&]() { 8286 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 8287 L->getHeader()) 8288 << "interleaved loop (interleaved count: " 8289 << NV("InterleaveCount", IC) << ")"; 8290 }); 8291 } else { 8292 // If we decided that it is *legal* to vectorize the loop, then do it. 8293 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 8294 &LVL, &CM, BFI, PSI); 8295 LVP.executePlan(LB, DT); 8296 ++LoopsVectorized; 8297 8298 // Add metadata to disable runtime unrolling a scalar loop when there are 8299 // no runtime checks about strides and memory. A scalar loop that is 8300 // rarely used is not worth unrolling. 
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Report whether any loop was transformed and whether the CFG was modified.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}
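
// Note: under the new pass manager this pass is registered as
// "loop-vectorize", so it can be exercised in isolation with, for example:
//   opt -passes=loop-vectorize -S input.ll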