//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
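//
// As a simple illustration of the widening transformation described above, a
// scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is, at a vectorization factor of 4, rewritten so that each vector iteration
// loads four elements of 'b' and 'c', adds them as <4 x ...> values, stores
// four elements of 'a', and increments 'i' by 4. A scalar epilogue loop
// handles any remaining iterations when 'n' is not a multiple of 4.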
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired and predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
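/// (A gap occurs, for instance, when a group with factor 3 only accesses
/// members 0 and 2; masking can then keep the wide load from touching the
/// unused member.)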
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a "
             "loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
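  // For example, with a typical data layout an i32 element is bitcast
  // compatible at VF == 4 (4 * 4 bytes matches the 16-byte store size of
  // <4 x i32>), whereas an i1 element is not at VF == 8 (8 * 1 byte does not
  // match the 1-byte store size of <8 x i1>).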
  if (VF > 1) {
    auto *VectorTy = FixedVectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
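///
/// For example, with VF == 4 and UF (the unroll factor) == 2, each vectorized
/// value from the original loop is represented by two <4 x Ty> values, and the
/// new induction variable advances by 8 per iteration of the vector loop.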
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
                unsigned VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p Operands instead of
  /// \p Instr's operands.
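  /// For example, with UF == 2 and VF == 4, fully scalarizing an instruction
  /// produces 2 * 4 = 8 scalar clones, one per (part, lane) pair.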
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                VPTransformState &State, VPValue *Addr,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr, VPValue *StoredValue,
                                  VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...;
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variables.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In
  /// the latter case \p EntryVal is a TruncInst and we must not record anything
  /// for that IV, but it's error-prone to expect callers of this routine to
  /// care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  void createInductionResumeValues(Loop *L, Value *VectorTripCount);

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile-guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM, BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
                          << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's location.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag,
                             TheLoop, I)
            << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// Memory access instructions may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
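  /// For instance, the pointer operand of a consecutive (non-gather) load is
  /// uniform after vectorization: a single scalar address feeds the wide load
  /// for all lanes.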
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the loop tail.
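  /// E.g. with a trip count of 10 and VF == 4, folding the tail executes three
  /// masked vector iterations instead of two vector iterations plus a
  /// two-iteration scalar remainder.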
1345 bool foldTailByMasking() const { return FoldTailByMasking; } 1346 1347 bool blockNeedsPredication(BasicBlock *BB) { 1348 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1349 } 1350 1351 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1352 /// nodes to the chain of instructions representing the reductions. Uses a 1353 /// MapVector to ensure deterministic iteration order. 1354 using ReductionChainMap = 1355 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1356 1357 /// Return the chain of instructions representing an inloop reduction. 1358 const ReductionChainMap &getInLoopReductionChains() const { 1359 return InLoopReductionChains; 1360 } 1361 1362 /// Returns true if the Phi is part of an inloop reduction. 1363 bool isInLoopReduction(PHINode *Phi) const { 1364 return InLoopReductionChains.count(Phi); 1365 } 1366 1367 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1368 /// with factor VF. Return the cost of the instruction, including 1369 /// scalarization overhead if it's needed. 1370 unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF); 1371 1372 /// Estimate cost of a call instruction CI if it were vectorized with factor 1373 /// VF. Return the cost of the instruction, including scalarization overhead 1374 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1375 /// scalarized - 1376 /// i.e. either vector version isn't available, or is too expensive. 1377 unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize); 1378 1379 /// Invalidates decisions already taken by the cost model. 1380 void invalidateCostModelingDecisions() { 1381 WideningDecisions.clear(); 1382 Uniforms.clear(); 1383 Scalars.clear(); 1384 } 1385 1386 private: 1387 unsigned NumPredStores = 0; 1388 1389 /// \return An upper bound for the vectorization factor, a power-of-2 larger 1390 /// than zero. One is returned if vectorization should best be avoided due 1391 /// to cost. 1392 unsigned computeFeasibleMaxVF(unsigned ConstTripCount); 1393 1394 /// The vectorization cost is a combination of the cost itself and a boolean 1395 /// indicating whether any of the contributing operations will actually 1396 /// operate on 1397 /// vector values after type legalization in the backend. If this latter value 1398 /// is 1399 /// false, then all operations will be scalarized (i.e. no vectorization has 1400 /// actually taken place). 1401 using VectorizationCostTy = std::pair<unsigned, bool>; 1402 1403 /// Returns the expected execution cost. The unit of the cost does 1404 /// not matter because we use the 'cost' units to compare different 1405 /// vector widths. The cost that is returned is *not* normalized by 1406 /// the factor width. 1407 VectorizationCostTy expectedCost(unsigned VF); 1408 1409 /// Returns the execution time cost of an instruction for a given vector 1410 /// width. Vector width of one means scalar. 1411 VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF); 1412 1413 /// The cost-computation logic from getInstructionCost which provides 1414 /// the vector type as an output parameter. 1415 unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy); 1416 1417 /// Calculate vectorization cost of memory instruction \p I. 1418 unsigned getMemoryInstructionCost(Instruction *I, unsigned VF); 1419 1420 /// The cost computation for scalarized memory instruction. 
1421 unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1422
1423 /// The cost computation for interleaving group of memory instructions.
1424 unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1425
1426 /// The cost computation for Gather/Scatter instruction.
1427 unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1428
1429 /// The cost computation for widening instruction \p I with consecutive
1430 /// memory access.
1431 unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1432
1433 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1434 /// Load: scalar load + broadcast.
1435 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1436 /// element)
1437 unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1438
1439 /// Estimate the overhead of scalarizing an instruction. This is a
1440 /// convenience wrapper for the type-based getScalarizationOverhead API.
1441 unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1442
1443 /// Returns whether the instruction is a load or store and will be emitted
1444 /// as a vector operation.
1445 bool isConsecutiveLoadOrStore(Instruction *I);
1446
1447 /// Returns true if an artificially high cost for emulated masked memrefs
1448 /// should be used.
1449 bool useEmulatedMaskMemRefHack(Instruction *I);
1450
1451 /// Map of scalar integer values to the smallest bitwidth they can be legally
1452 /// represented as. The vector equivalents of these values should be truncated
1453 /// to this type.
1454 MapVector<Instruction *, uint64_t> MinBWs;
1455
1456 /// A type representing the costs for instructions if they were to be
1457 /// scalarized rather than vectorized. The entries are Instruction-Cost
1458 /// pairs.
1459 using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1460
1461 /// A set containing all BasicBlocks that are known to be present after
1462 /// vectorization as predicated blocks.
1463 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1464
1465 /// Records whether it is allowed to have the original scalar loop execute at
1466 /// least once. This may be needed as a fallback loop in case runtime
1467 /// aliasing/dependence checks fail, or to handle the tail/remainder
1468 /// iterations when the trip count is unknown or doesn't divide by the VF,
1469 /// or as a peel-loop to handle gaps in interleave-groups.
1470 /// Under optsize and when the trip count is very small we don't allow any
1471 /// iterations to execute in the scalar loop.
1472 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1473
1474 /// All blocks of the loop are to be masked to fold the tail of scalar iterations.
1475 bool FoldTailByMasking = false;
1476
1477 /// A map holding scalar costs for different vectorization factors. The
1478 /// presence of a cost for an instruction in the mapping indicates that the
1479 /// instruction will be scalarized when vectorizing with the associated
1480 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1481 DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1482
1483 /// Holds the instructions known to be uniform after vectorization.
1484 /// The data is collected per VF.
1485 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1486
1487 /// Holds the instructions known to be scalar after vectorization.
1488 /// The data is collected per VF.
1489 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars; 1490 1491 /// Holds the instructions (address computations) that are forced to be 1492 /// scalarized. 1493 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1494 1495 /// PHINodes of the reductions that should be expanded in-loop along with 1496 /// their associated chains of reduction operations, in program order from top 1497 /// (PHI) to bottom 1498 ReductionChainMap InLoopReductionChains; 1499 1500 /// Returns the expected difference in cost from scalarizing the expression 1501 /// feeding a predicated instruction \p PredInst. The instructions to 1502 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1503 /// non-negative return value implies the expression will be scalarized. 1504 /// Currently, only single-use chains are considered for scalarization. 1505 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1506 unsigned VF); 1507 1508 /// Collect the instructions that are uniform after vectorization. An 1509 /// instruction is uniform if we represent it with a single scalar value in 1510 /// the vectorized loop corresponding to each vector iteration. Examples of 1511 /// uniform instructions include pointer operands of consecutive or 1512 /// interleaved memory accesses. Note that although uniformity implies an 1513 /// instruction will be scalar, the reverse is not true. In general, a 1514 /// scalarized instruction will be represented by VF scalar values in the 1515 /// vectorized loop, each corresponding to an iteration of the original 1516 /// scalar loop. 1517 void collectLoopUniforms(unsigned VF); 1518 1519 /// Collect the instructions that are scalar after vectorization. An 1520 /// instruction is scalar if it is known to be uniform or will be scalarized 1521 /// during vectorization. Non-uniform scalarized instructions will be 1522 /// represented by VF values in the vectorized loop, each corresponding to an 1523 /// iteration of the original scalar loop. 1524 void collectLoopScalars(unsigned VF); 1525 1526 /// Keeps cost model vectorization decision and cost for instructions. 1527 /// Right now it is used for memory instructions only. 1528 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, 1529 std::pair<InstWidening, unsigned>>; 1530 1531 DecisionList WideningDecisions; 1532 1533 /// Returns true if \p V is expected to be vectorized and it needs to be 1534 /// extracted. 1535 bool needsExtract(Value *V, unsigned VF) const { 1536 Instruction *I = dyn_cast<Instruction>(V); 1537 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) 1538 return false; 1539 1540 // Assume we can vectorize V (and hence we need extraction) if the 1541 // scalars are not computed yet. This can happen, because it is called 1542 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1543 // the scalars are collected. That should be a safe assumption in most 1544 // cases, because we check if the operands have vectorizable types 1545 // beforehand in LoopVectorizationLegality. 1546 return Scalars.find(VF) == Scalars.end() || 1547 !isScalarAfterVectorization(I, VF); 1548 }; 1549 1550 /// Returns a range containing only operands needing to be extracted. 
1551 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1552 unsigned VF) { 1553 return SmallVector<Value *, 4>(make_filter_range( 1554 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1555 } 1556 1557 public: 1558 /// The loop that we evaluate. 1559 Loop *TheLoop; 1560 1561 /// Predicated scalar evolution analysis. 1562 PredicatedScalarEvolution &PSE; 1563 1564 /// Loop Info analysis. 1565 LoopInfo *LI; 1566 1567 /// Vectorization legality. 1568 LoopVectorizationLegality *Legal; 1569 1570 /// Vector target information. 1571 const TargetTransformInfo &TTI; 1572 1573 /// Target Library Info. 1574 const TargetLibraryInfo *TLI; 1575 1576 /// Demanded bits analysis. 1577 DemandedBits *DB; 1578 1579 /// Assumption cache. 1580 AssumptionCache *AC; 1581 1582 /// Interface to emit optimization remarks. 1583 OptimizationRemarkEmitter *ORE; 1584 1585 const Function *TheFunction; 1586 1587 /// Loop Vectorize Hint. 1588 const LoopVectorizeHints *Hints; 1589 1590 /// The interleave access information contains groups of interleaved accesses 1591 /// with the same stride and close to each other. 1592 InterleavedAccessInfo &InterleaveInfo; 1593 1594 /// Values to ignore in the cost model. 1595 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1596 1597 /// Values to ignore in the cost model when VF > 1. 1598 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1599 }; 1600 1601 } // end namespace llvm 1602 1603 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1604 // vectorization. The loop needs to be annotated with #pragma omp simd 1605 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1606 // vector length information is not provided, vectorization is not considered 1607 // explicit. Interleave hints are not allowed either. These limitations will be 1608 // relaxed in the future. 1609 // Please, note that we are currently forced to abuse the pragma 'clang 1610 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1611 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1612 // provides *explicit vectorization hints* (LV can bypass legal checks and 1613 // assume that vectorization is legal). However, both hints are implemented 1614 // using the same metadata (llvm.loop.vectorize, processed by 1615 // LoopVectorizeHints). This will be fixed in the future when the native IR 1616 // representation for pragma 'omp simd' is introduced. 1617 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1618 OptimizationRemarkEmitter *ORE) { 1619 assert(!OuterLp->empty() && "This is not an outer loop"); 1620 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1621 1622 // Only outer loops with an explicit vectorization hint are supported. 1623 // Unannotated outer loops are ignored. 1624 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1625 return false; 1626 1627 Function *Fn = OuterLp->getHeader()->getParent(); 1628 if (!Hints.allowVectorization(Fn, OuterLp, 1629 true /*VectorizeOnlyWhenForced*/)) { 1630 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1631 return false; 1632 } 1633 1634 if (Hints.getInterleave() > 1) { 1635 // TODO: Interleave support is future work. 
1636 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1637 "outer loops.\n"); 1638 Hints.emitRemarkWithHints(); 1639 return false; 1640 } 1641 1642 return true; 1643 } 1644 1645 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1646 OptimizationRemarkEmitter *ORE, 1647 SmallVectorImpl<Loop *> &V) { 1648 // Collect inner loops and outer loops without irreducible control flow. For 1649 // now, only collect outer loops that have explicit vectorization hints. If we 1650 // are stress testing the VPlan H-CFG construction, we collect the outermost 1651 // loop of every loop nest. 1652 if (L.empty() || VPlanBuildStressTest || 1653 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1654 LoopBlocksRPO RPOT(&L); 1655 RPOT.perform(LI); 1656 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1657 V.push_back(&L); 1658 // TODO: Collect inner loops inside marked outer loops in case 1659 // vectorization fails for the outer loop. Do not invoke 1660 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1661 // already known to be reducible. We can use an inherited attribute for 1662 // that. 1663 return; 1664 } 1665 } 1666 for (Loop *InnerL : L) 1667 collectSupportedLoops(*InnerL, LI, ORE, V); 1668 } 1669 1670 namespace { 1671 1672 /// The LoopVectorize Pass. 1673 struct LoopVectorize : public FunctionPass { 1674 /// Pass identification, replacement for typeid 1675 static char ID; 1676 1677 LoopVectorizePass Impl; 1678 1679 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1680 bool VectorizeOnlyWhenForced = false) 1681 : FunctionPass(ID), 1682 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1683 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1684 } 1685 1686 bool runOnFunction(Function &F) override { 1687 if (skipFunction(F)) 1688 return false; 1689 1690 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1691 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1692 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1693 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1694 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1695 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1696 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 1697 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1698 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1699 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1700 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1701 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1702 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1703 1704 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1705 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1706 1707 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1708 GetLAA, *ORE, PSI).MadeAnyChange; 1709 } 1710 1711 void getAnalysisUsage(AnalysisUsage &AU) const override { 1712 AU.addRequired<AssumptionCacheTracker>(); 1713 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1714 AU.addRequired<DominatorTreeWrapperPass>(); 1715 AU.addRequired<LoopInfoWrapperPass>(); 1716 AU.addRequired<ScalarEvolutionWrapperPass>(); 1717 AU.addRequired<TargetTransformInfoWrapperPass>(); 1718 AU.addRequired<AAResultsWrapperPass>(); 1719 AU.addRequired<LoopAccessLegacyAnalysis>(); 1720 AU.addRequired<DemandedBitsWrapperPass>(); 1721 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1722 AU.addRequired<InjectTLIMappingsLegacy>(); 1723 1724 // We currently do not preserve loopinfo/dominator analyses with outer loop 1725 // vectorization. Until this is addressed, mark these analyses as preserved 1726 // only for non-VPlan-native path. 1727 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1728 if (!EnableVPlanNativePath) { 1729 AU.addPreserved<LoopInfoWrapperPass>(); 1730 AU.addPreserved<DominatorTreeWrapperPass>(); 1731 } 1732 1733 AU.addPreserved<BasicAAWrapperPass>(); 1734 AU.addPreserved<GlobalsAAWrapperPass>(); 1735 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1736 } 1737 }; 1738 1739 } // end anonymous namespace 1740 1741 //===----------------------------------------------------------------------===// 1742 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1743 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1744 //===----------------------------------------------------------------------===// 1745 1746 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1747 // We need to place the broadcast of invariant variables outside the loop, 1748 // but only if it's proven safe to do so. Else, broadcast will be inside 1749 // vector loop body. 1750 Instruction *Instr = dyn_cast<Instruction>(V); 1751 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1752 (!Instr || 1753 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1754 // Place the code for broadcasting invariant variables in the new preheader. 1755 IRBuilder<>::InsertPointGuard Guard(Builder); 1756 if (SafeToHoist) 1757 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1758 1759 // Broadcast the scalar into all locations in the vector. 
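  // Editorial illustration, not part of the original source: for a
  // non-constant scalar %x and an assumed VF of 4, the CreateVectorSplat call
  // below typically lowers to an insertelement into lane 0 followed by a
  // zero-mask shufflevector, roughly:
  //   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                        <4 x i32> undef, <4 x i32> zeroinitializer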
1760 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1761 1762 return Shuf; 1763 } 1764 1765 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1766 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1767 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1768 "Expected either an induction phi-node or a truncate of it!"); 1769 Value *Start = II.getStartValue(); 1770 1771 // Construct the initial value of the vector IV in the vector loop preheader 1772 auto CurrIP = Builder.saveIP(); 1773 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1774 if (isa<TruncInst>(EntryVal)) { 1775 assert(Start->getType()->isIntegerTy() && 1776 "Truncation requires an integer type"); 1777 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1778 Step = Builder.CreateTrunc(Step, TruncType); 1779 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1780 } 1781 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1782 Value *SteppedStart = 1783 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1784 1785 // We create vector phi nodes for both integer and floating-point induction 1786 // variables. Here, we determine the kind of arithmetic we will perform. 1787 Instruction::BinaryOps AddOp; 1788 Instruction::BinaryOps MulOp; 1789 if (Step->getType()->isIntegerTy()) { 1790 AddOp = Instruction::Add; 1791 MulOp = Instruction::Mul; 1792 } else { 1793 AddOp = II.getInductionOpcode(); 1794 MulOp = Instruction::FMul; 1795 } 1796 1797 // Multiply the vectorization factor by the step using integer or 1798 // floating-point arithmetic as appropriate. 1799 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); 1800 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1801 1802 // Create a vector splat to use in the induction update. 1803 // 1804 // FIXME: If the step is non-constant, we create the vector splat with 1805 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1806 // handle a constant vector splat. 1807 Value *SplatVF = 1808 isa<Constant>(Mul) 1809 ? ConstantVector::getSplat({VF, false}, cast<Constant>(Mul)) 1810 : Builder.CreateVectorSplat(VF, Mul); 1811 Builder.restoreIP(CurrIP); 1812 1813 // We may need to add the step a number of times, depending on the unroll 1814 // factor. The last of those goes into the PHI. 1815 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1816 &*LoopVectorBody->getFirstInsertionPt()); 1817 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1818 Instruction *LastInduction = VecInd; 1819 for (unsigned Part = 0; Part < UF; ++Part) { 1820 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1821 1822 if (isa<TruncInst>(EntryVal)) 1823 addMetadata(LastInduction, EntryVal); 1824 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1825 1826 LastInduction = cast<Instruction>(addFastMathFlag( 1827 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1828 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1829 } 1830 1831 // Move the last step to the end of the latch block. This ensures consistent 1832 // placement of all induction updates. 
1833 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1834 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1835 auto *ICmp = cast<Instruction>(Br->getCondition()); 1836 LastInduction->moveBefore(ICmp); 1837 LastInduction->setName("vec.ind.next"); 1838 1839 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1840 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1841 } 1842 1843 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1844 return Cost->isScalarAfterVectorization(I, VF) || 1845 Cost->isProfitableToScalarize(I, VF); 1846 } 1847 1848 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1849 if (shouldScalarizeInstruction(IV)) 1850 return true; 1851 auto isScalarInst = [&](User *U) -> bool { 1852 auto *I = cast<Instruction>(U); 1853 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1854 }; 1855 return llvm::any_of(IV->users(), isScalarInst); 1856 } 1857 1858 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1859 const InductionDescriptor &ID, const Instruction *EntryVal, 1860 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1861 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1862 "Expected either an induction phi-node or a truncate of it!"); 1863 1864 // This induction variable is not the phi from the original loop but the 1865 // newly-created IV based on the proof that casted Phi is equal to the 1866 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1867 // re-uses the same InductionDescriptor that original IV uses but we don't 1868 // have to do any recording in this case - that is done when original IV is 1869 // processed. 1870 if (isa<TruncInst>(EntryVal)) 1871 return; 1872 1873 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1874 if (Casts.empty()) 1875 return; 1876 // Only the first Cast instruction in the Casts vector is of interest. 1877 // The rest of the Casts (if exist) have no uses outside the 1878 // induction update chain itself. 1879 Instruction *CastInst = *Casts.begin(); 1880 if (Lane < UINT_MAX) 1881 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1882 else 1883 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1884 } 1885 1886 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1887 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1888 "Primary induction variable must have an integer type"); 1889 1890 auto II = Legal->getInductionVars().find(IV); 1891 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 1892 1893 auto ID = II->second; 1894 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1895 1896 // The value from the original loop to which we are mapping the new induction 1897 // variable. 1898 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1899 1900 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1901 1902 // Generate code for the induction step. 
Note that induction steps are 1903 // required to be loop-invariant 1904 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 1905 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 1906 "Induction step should be loop invariant"); 1907 if (PSE.getSE()->isSCEVable(IV->getType())) { 1908 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1909 return Exp.expandCodeFor(Step, Step->getType(), 1910 LoopVectorPreHeader->getTerminator()); 1911 } 1912 return cast<SCEVUnknown>(Step)->getValue(); 1913 }; 1914 1915 // The scalar value to broadcast. This is derived from the canonical 1916 // induction variable. If a truncation type is given, truncate the canonical 1917 // induction variable and step. Otherwise, derive these values from the 1918 // induction descriptor. 1919 auto CreateScalarIV = [&](Value *&Step) -> Value * { 1920 Value *ScalarIV = Induction; 1921 if (IV != OldInduction) { 1922 ScalarIV = IV->getType()->isIntegerTy() 1923 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1924 : Builder.CreateCast(Instruction::SIToFP, Induction, 1925 IV->getType()); 1926 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1927 ScalarIV->setName("offset.idx"); 1928 } 1929 if (Trunc) { 1930 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1931 assert(Step->getType()->isIntegerTy() && 1932 "Truncation requires an integer step"); 1933 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1934 Step = Builder.CreateTrunc(Step, TruncType); 1935 } 1936 return ScalarIV; 1937 }; 1938 1939 // Create the vector values from the scalar IV, in the absence of creating a 1940 // vector IV. 1941 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 1942 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1943 for (unsigned Part = 0; Part < UF; ++Part) { 1944 Value *EntryPart = 1945 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); 1946 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1947 if (Trunc) 1948 addMetadata(EntryPart, Trunc); 1949 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1950 } 1951 }; 1952 1953 // Now do the actual transformations, and start with creating the step value. 1954 Value *Step = CreateStepValue(ID.getStep()); 1955 if (VF <= 1) { 1956 Value *ScalarIV = CreateScalarIV(Step); 1957 CreateSplatIV(ScalarIV, Step); 1958 return; 1959 } 1960 1961 // Determine if we want a scalar version of the induction variable. This is 1962 // true if the induction variable itself is not widened, or if it has at 1963 // least one user in the loop that is not widened. 1964 auto NeedsScalarIV = needsScalarInduction(EntryVal); 1965 if (!NeedsScalarIV) { 1966 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1967 return; 1968 } 1969 1970 // Try to create a new independent vector induction variable. If we can't 1971 // create the phi node, we will splat the scalar induction variable in each 1972 // loop iteration. 1973 if (!shouldScalarizeInstruction(EntryVal)) { 1974 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1975 Value *ScalarIV = CreateScalarIV(Step); 1976 // Create scalar steps that can be used by instructions we will later 1977 // scalarize. Note that the addition of the scalar steps will not increase 1978 // the number of instructions in the loop in the common case prior to 1979 // InstCombine. We will be trading one vector extract for each scalar step. 
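  // Editorial illustration, not part of the original source: assuming VF = 4,
  // UF = 1 and an integer step S, buildScalarSteps below produces one value
  // per lane,
  //   %scalar.iv + 0 * S, %scalar.iv + 1 * S, %scalar.iv + 2 * S, %scalar.iv + 3 * S
  // so later scalarized users read these values directly instead of
  // extracting lanes from a vector induction variable.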
1980 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1981 return; 1982 } 1983 1984 // All IV users are scalar instructions, so only emit a scalar IV, not a 1985 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 1986 // predicate used by the masked loads/stores. 1987 Value *ScalarIV = CreateScalarIV(Step); 1988 if (!Cost->isScalarEpilogueAllowed()) 1989 CreateSplatIV(ScalarIV, Step); 1990 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1991 } 1992 1993 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 1994 Instruction::BinaryOps BinOp) { 1995 // Create and check the types. 1996 auto *ValVTy = cast<VectorType>(Val->getType()); 1997 int VLen = ValVTy->getNumElements(); 1998 1999 Type *STy = Val->getType()->getScalarType(); 2000 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2001 "Induction Step must be an integer or FP"); 2002 assert(Step->getType() == STy && "Step has wrong type"); 2003 2004 SmallVector<Constant *, 8> Indices; 2005 2006 if (STy->isIntegerTy()) { 2007 // Create a vector of consecutive numbers from zero to VF. 2008 for (int i = 0; i < VLen; ++i) 2009 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2010 2011 // Add the consecutive indices to the vector value. 2012 Constant *Cv = ConstantVector::get(Indices); 2013 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2014 Step = Builder.CreateVectorSplat(VLen, Step); 2015 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2016 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2017 // which can be found from the original scalar operations. 2018 Step = Builder.CreateMul(Cv, Step); 2019 return Builder.CreateAdd(Val, Step, "induction"); 2020 } 2021 2022 // Floating point induction. 2023 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2024 "Binary Opcode should be specified for FP induction"); 2025 // Create a vector of consecutive numbers from zero to VF. 2026 for (int i = 0; i < VLen; ++i) 2027 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2028 2029 // Add the consecutive indices to the vector value. 2030 Constant *Cv = ConstantVector::get(Indices); 2031 2032 Step = Builder.CreateVectorSplat(VLen, Step); 2033 2034 // Floating point operations had to be 'fast' to enable the induction. 2035 FastMathFlags Flags; 2036 Flags.setFast(); 2037 2038 Value *MulOp = Builder.CreateFMul(Cv, Step); 2039 if (isa<Instruction>(MulOp)) 2040 // Have to check, MulOp may be a constant 2041 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 2042 2043 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2044 if (isa<Instruction>(BOp)) 2045 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2046 return BOp; 2047 } 2048 2049 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2050 Instruction *EntryVal, 2051 const InductionDescriptor &ID) { 2052 // We shouldn't have to build scalar steps if we aren't vectorizing. 2053 assert(VF > 1 && "VF should be greater than one"); 2054 2055 // Get the value type and ensure it and the step have the same integer type. 2056 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2057 assert(ScalarIVTy == Step->getType() && 2058 "Val and Step should have the same type"); 2059 2060 // We build scalar steps for both integer and floating-point induction 2061 // variables. Here, we determine the kind of arithmetic we will perform. 
2062 Instruction::BinaryOps AddOp; 2063 Instruction::BinaryOps MulOp; 2064 if (ScalarIVTy->isIntegerTy()) { 2065 AddOp = Instruction::Add; 2066 MulOp = Instruction::Mul; 2067 } else { 2068 AddOp = ID.getInductionOpcode(); 2069 MulOp = Instruction::FMul; 2070 } 2071 2072 // Determine the number of scalars we need to generate for each unroll 2073 // iteration. If EntryVal is uniform, we only need to generate the first 2074 // lane. Otherwise, we generate all VF values. 2075 unsigned Lanes = 2076 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 2077 : VF; 2078 // Compute the scalar steps and save the results in VectorLoopValueMap. 2079 for (unsigned Part = 0; Part < UF; ++Part) { 2080 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2081 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); 2082 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2083 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2084 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2085 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2086 } 2087 } 2088 } 2089 2090 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2091 assert(V != Induction && "The new induction variable should not be used."); 2092 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2093 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2094 2095 // If we have a stride that is replaced by one, do it here. Defer this for 2096 // the VPlan-native path until we start running Legal checks in that path. 2097 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2098 V = ConstantInt::get(V->getType(), 1); 2099 2100 // If we have a vector mapped to this value, return it. 2101 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2102 return VectorLoopValueMap.getVectorValue(V, Part); 2103 2104 // If the value has not been vectorized, check if it has been scalarized 2105 // instead. If it has been scalarized, and we actually need the value in 2106 // vector form, we will construct the vector values on demand. 2107 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2108 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2109 2110 // If we've scalarized a value, that value should be an instruction. 2111 auto *I = cast<Instruction>(V); 2112 2113 // If we aren't vectorizing, we can just copy the scalar map values over to 2114 // the vector map. 2115 if (VF == 1) { 2116 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2117 return ScalarValue; 2118 } 2119 2120 // Get the last scalar instruction we generated for V and Part. If the value 2121 // is known to be uniform after vectorization, this corresponds to lane zero 2122 // of the Part unroll iteration. Otherwise, the last instruction is the one 2123 // we created for the last vector lane of the Part unroll iteration. 2124 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; 2125 auto *LastInst = cast<Instruction>( 2126 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2127 2128 // Set the insert point after the last scalarized instruction. This ensures 2129 // the insertelement sequence will directly follow the scalar definitions. 2130 auto OldIP = Builder.saveIP(); 2131 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2132 Builder.SetInsertPoint(&*NewIP); 2133 2134 // However, if we are vectorizing, we need to construct the vector values. 
2135 // If the value is known to be uniform after vectorization, we can just 2136 // broadcast the scalar value corresponding to lane zero for each unroll 2137 // iteration. Otherwise, we construct the vector values using insertelement 2138 // instructions. Since the resulting vectors are stored in 2139 // VectorLoopValueMap, we will only generate the insertelements once. 2140 Value *VectorValue = nullptr; 2141 if (Cost->isUniformAfterVectorization(I, VF)) { 2142 VectorValue = getBroadcastInstrs(ScalarValue); 2143 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2144 } else { 2145 // Initialize packing with insertelements to start from undef. 2146 Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), VF)); 2147 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2148 for (unsigned Lane = 0; Lane < VF; ++Lane) 2149 packScalarIntoVectorValue(V, {Part, Lane}); 2150 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2151 } 2152 Builder.restoreIP(OldIP); 2153 return VectorValue; 2154 } 2155 2156 // If this scalar is unknown, assume that it is a constant or that it is 2157 // loop invariant. Broadcast V and save the value for future uses. 2158 Value *B = getBroadcastInstrs(V); 2159 VectorLoopValueMap.setVectorValue(V, Part, B); 2160 return B; 2161 } 2162 2163 Value * 2164 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2165 const VPIteration &Instance) { 2166 // If the value is not an instruction contained in the loop, it should 2167 // already be scalar. 2168 if (OrigLoop->isLoopInvariant(V)) 2169 return V; 2170 2171 assert(Instance.Lane > 0 2172 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2173 : true && "Uniform values only have lane zero"); 2174 2175 // If the value from the original loop has not been vectorized, it is 2176 // represented by UF x VF scalar values in the new loop. Return the requested 2177 // scalar value. 2178 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2179 return VectorLoopValueMap.getScalarValue(V, Instance); 2180 2181 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2182 // for the given unroll part. If this entry is not a vector type (i.e., the 2183 // vectorization factor is one), there is no need to generate an 2184 // extractelement instruction. 2185 auto *U = getOrCreateVectorValue(V, Instance.Part); 2186 if (!U->getType()->isVectorTy()) { 2187 assert(VF == 1 && "Value not scalarized has non-vector type"); 2188 return U; 2189 } 2190 2191 // Otherwise, the value from the original loop has been vectorized and is 2192 // represented by UF vector values. Extract and return the requested scalar 2193 // value from the appropriate vector lane. 
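  // Editorial illustration, not part of the original source: for an assumed
  // VF of 4 and Instance.Lane == 2, the extract below produces IR along the
  // lines of
  //   %lane = extractelement <4 x i32> %vec.part, i32 2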
2194 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2195 } 2196 2197 void InnerLoopVectorizer::packScalarIntoVectorValue( 2198 Value *V, const VPIteration &Instance) { 2199 assert(V != Induction && "The new induction variable should not be used."); 2200 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2201 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2202 2203 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2204 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2205 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2206 Builder.getInt32(Instance.Lane)); 2207 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2208 } 2209 2210 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2211 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2212 SmallVector<int, 8> ShuffleMask; 2213 for (unsigned i = 0; i < VF; ++i) 2214 ShuffleMask.push_back(VF - i - 1); 2215 2216 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2217 ShuffleMask, "reverse"); 2218 } 2219 2220 // Return whether we allow using masked interleave-groups (for dealing with 2221 // strided loads/stores that reside in predicated blocks, or for dealing 2222 // with gaps). 2223 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2224 // If an override option has been passed in for interleaved accesses, use it. 2225 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2226 return EnableMaskedInterleavedMemAccesses; 2227 2228 return TTI.enableMaskedInterleavedAccessVectorization(); 2229 } 2230 2231 // Try to vectorize the interleave group that \p Instr belongs to. 2232 // 2233 // E.g. Translate following interleaved load group (factor = 3): 2234 // for (i = 0; i < N; i+=3) { 2235 // R = Pic[i]; // Member of index 0 2236 // G = Pic[i+1]; // Member of index 1 2237 // B = Pic[i+2]; // Member of index 2 2238 // ... // do something to R, G, B 2239 // } 2240 // To: 2241 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2242 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2243 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2244 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2245 // 2246 // Or translate following interleaved store group (factor = 3): 2247 // for (i = 0; i < N; i+=3) { 2248 // ... do something to R, G, B 2249 // Pic[i] = R; // Member of index 0 2250 // Pic[i+1] = G; // Member of index 1 2251 // Pic[i+2] = B; // Member of index 2 2252 // } 2253 // To: 2254 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2255 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2256 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2257 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2258 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2259 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2260 const InterleaveGroup<Instruction> *Group, VPTransformState &State, 2261 VPValue *Addr, VPValue *BlockInMask) { 2262 Instruction *Instr = Group->getInsertPos(); 2263 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2264 2265 // Prepare for the vector type of the interleaved load/store. 2266 Type *ScalarTy = getMemInstValueType(Instr); 2267 unsigned InterleaveFactor = Group->getFactor(); 2268 auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF); 2269 2270 // Prepare for the new pointers. 
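  // Editorial illustration, not part of the original source: with the RGB
  // example from the comment above (interleave factor 3) and an assumed VF of
  // 4, ScalarTy is i32 and VecTy is <12 x i32>, matching the %wide.vec shown
  // there.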
2271 SmallVector<Value *, 2> AddrParts; 2272 unsigned Index = Group->getIndex(Instr); 2273 2274 // TODO: extend the masked interleaved-group support to reversed access. 2275 assert((!BlockInMask || !Group->isReverse()) && 2276 "Reversed masked interleave-group not supported."); 2277 2278 // If the group is reverse, adjust the index to refer to the last vector lane 2279 // instead of the first. We adjust the index from the first vector lane, 2280 // rather than directly getting the pointer for lane VF - 1, because the 2281 // pointer operand of the interleaved access is supposed to be uniform. For 2282 // uniform instructions, we're only required to generate a value for the 2283 // first vector lane in each unroll iteration. 2284 if (Group->isReverse()) 2285 Index += (VF - 1) * Group->getFactor(); 2286 2287 for (unsigned Part = 0; Part < UF; Part++) { 2288 Value *AddrPart = State.get(Addr, {Part, 0}); 2289 setDebugLocFromInst(Builder, AddrPart); 2290 2291 // Notice current instruction could be any index. Need to adjust the address 2292 // to the member of index 0. 2293 // 2294 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2295 // b = A[i]; // Member of index 0 2296 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2297 // 2298 // E.g. A[i+1] = a; // Member of index 1 2299 // A[i] = b; // Member of index 0 2300 // A[i+2] = c; // Member of index 2 (Current instruction) 2301 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2302 2303 bool InBounds = false; 2304 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2305 InBounds = gep->isInBounds(); 2306 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2307 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2308 2309 // Cast to the vector pointer type. 2310 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2311 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2312 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2313 } 2314 2315 setDebugLocFromInst(Builder, Instr); 2316 Value *UndefVec = UndefValue::get(VecTy); 2317 2318 Value *MaskForGaps = nullptr; 2319 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2320 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); 2321 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2322 } 2323 2324 // Vectorize the interleaved load group. 2325 if (isa<LoadInst>(Instr)) { 2326 // For each unroll part, create a wide load for the group. 2327 SmallVector<Value *, 2> NewLoads; 2328 for (unsigned Part = 0; Part < UF; Part++) { 2329 Instruction *NewLoad; 2330 if (BlockInMask || MaskForGaps) { 2331 assert(useMaskedInterleavedAccesses(*TTI) && 2332 "masked interleaved groups are not allowed."); 2333 Value *GroupMask = MaskForGaps; 2334 if (BlockInMask) { 2335 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2336 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2337 Value *ShuffledMask = Builder.CreateShuffleVector( 2338 BlockInMaskPart, Undefs, 2339 createReplicatedMask(InterleaveFactor, VF), "interleaved.mask"); 2340 GroupMask = MaskForGaps 2341 ? 
Builder.CreateBinOp(Instruction::And, ShuffledMask, 2342 MaskForGaps) 2343 : ShuffledMask; 2344 } 2345 NewLoad = 2346 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2347 GroupMask, UndefVec, "wide.masked.vec"); 2348 } 2349 else 2350 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2351 Group->getAlign(), "wide.vec"); 2352 Group->addMetadata(NewLoad); 2353 NewLoads.push_back(NewLoad); 2354 } 2355 2356 // For each member in the group, shuffle out the appropriate data from the 2357 // wide loads. 2358 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2359 Instruction *Member = Group->getMember(I); 2360 2361 // Skip the gaps in the group. 2362 if (!Member) 2363 continue; 2364 2365 auto StrideMask = createStrideMask(I, InterleaveFactor, VF); 2366 for (unsigned Part = 0; Part < UF; Part++) { 2367 Value *StridedVec = Builder.CreateShuffleVector( 2368 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2369 2370 // If this member has different type, cast the result type. 2371 if (Member->getType() != ScalarTy) { 2372 VectorType *OtherVTy = FixedVectorType::get(Member->getType(), VF); 2373 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2374 } 2375 2376 if (Group->isReverse()) 2377 StridedVec = reverseVector(StridedVec); 2378 2379 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2380 } 2381 } 2382 return; 2383 } 2384 2385 // The sub vector type for current instruction. 2386 auto *SubVT = FixedVectorType::get(ScalarTy, VF); 2387 2388 // Vectorize the interleaved store group. 2389 for (unsigned Part = 0; Part < UF; Part++) { 2390 // Collect the stored vector from each member. 2391 SmallVector<Value *, 4> StoredVecs; 2392 for (unsigned i = 0; i < InterleaveFactor; i++) { 2393 // Interleaved store group doesn't allow a gap, so each index has a member 2394 Instruction *Member = Group->getMember(i); 2395 assert(Member && "Fail to get a member from an interleaved store group"); 2396 2397 Value *StoredVec = getOrCreateVectorValue( 2398 cast<StoreInst>(Member)->getValueOperand(), Part); 2399 if (Group->isReverse()) 2400 StoredVec = reverseVector(StoredVec); 2401 2402 // If this member has different type, cast it to a unified type. 2403 2404 if (StoredVec->getType() != SubVT) 2405 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2406 2407 StoredVecs.push_back(StoredVec); 2408 } 2409 2410 // Concatenate all vectors into a wide vector. 2411 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2412 2413 // Interleave the elements in the wide vector. 2414 Value *IVec = Builder.CreateShuffleVector( 2415 WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor), 2416 "interleaved.vec"); 2417 2418 Instruction *NewStoreInstr; 2419 if (BlockInMask) { 2420 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2421 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2422 Value *ShuffledMask = Builder.CreateShuffleVector( 2423 BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF), 2424 "interleaved.mask"); 2425 NewStoreInstr = Builder.CreateMaskedStore( 2426 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2427 } 2428 else 2429 NewStoreInstr = 2430 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2431 2432 Group->addMetadata(NewStoreInstr); 2433 } 2434 } 2435 2436 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2437 VPTransformState &State, 2438 VPValue *Addr, 2439 VPValue *StoredValue, 2440 VPValue *BlockInMask) { 2441 // Attempt to issue a wide load. 
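  // Editorial illustration, not part of the original source (assumed VF = 4,
  // i32 elements): depending on the widening decision checked below, a scalar
  // load is emitted either as a consecutive wide load
  //   %wide.load = load <4 x i32>, <4 x i32>* %vec.ptr
  // (followed by a reversing shufflevector for reverse-consecutive accesses),
  // or as a gather
  //   %gather = call <4 x i32> @llvm.masked.gather...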
2442 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2443 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2444 2445 assert((LI || SI) && "Invalid Load/Store instruction"); 2446 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2447 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2448 2449 LoopVectorizationCostModel::InstWidening Decision = 2450 Cost->getWideningDecision(Instr, VF); 2451 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2452 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2453 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2454 "CM decision is not to widen the memory instruction"); 2455 2456 Type *ScalarDataTy = getMemInstValueType(Instr); 2457 auto *DataTy = FixedVectorType::get(ScalarDataTy, VF); 2458 const Align Alignment = getLoadStoreAlignment(Instr); 2459 2460 // Determine if the pointer operand of the access is either consecutive or 2461 // reverse consecutive. 2462 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2463 bool ConsecutiveStride = 2464 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2465 bool CreateGatherScatter = 2466 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2467 2468 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2469 // gather/scatter. Otherwise Decision should have been to Scalarize. 2470 assert((ConsecutiveStride || CreateGatherScatter) && 2471 "The instruction should be scalarized"); 2472 (void)ConsecutiveStride; 2473 2474 VectorParts BlockInMaskParts(UF); 2475 bool isMaskRequired = BlockInMask; 2476 if (isMaskRequired) 2477 for (unsigned Part = 0; Part < UF; ++Part) 2478 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2479 2480 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2481 // Calculate the pointer for the specific unroll-part. 2482 GetElementPtrInst *PartPtr = nullptr; 2483 2484 bool InBounds = false; 2485 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2486 InBounds = gep->isInBounds(); 2487 2488 if (Reverse) { 2489 // If the address is consecutive but reversed, then the 2490 // wide store needs to start at the last vector element. 2491 PartPtr = cast<GetElementPtrInst>( 2492 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); 2493 PartPtr->setIsInBounds(InBounds); 2494 PartPtr = cast<GetElementPtrInst>( 2495 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); 2496 PartPtr->setIsInBounds(InBounds); 2497 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2498 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2499 } else { 2500 PartPtr = cast<GetElementPtrInst>( 2501 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); 2502 PartPtr->setIsInBounds(InBounds); 2503 } 2504 2505 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2506 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2507 }; 2508 2509 // Handle Stores: 2510 if (SI) { 2511 setDebugLocFromInst(Builder, SI); 2512 2513 for (unsigned Part = 0; Part < UF; ++Part) { 2514 Instruction *NewSI = nullptr; 2515 Value *StoredVal = State.get(StoredValue, Part); 2516 if (CreateGatherScatter) { 2517 Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; 2518 Value *VectorGep = State.get(Addr, Part); 2519 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2520 MaskPart); 2521 } else { 2522 if (Reverse) { 2523 // If we store to reverse consecutive memory locations, then we need 2524 // to reverse the order of elements in the stored value. 2525 StoredVal = reverseVector(StoredVal); 2526 // We don't want to update the value in the map as it might be used in 2527 // another expression. So don't call resetVectorValue(StoredVal). 2528 } 2529 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2530 if (isMaskRequired) 2531 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2532 BlockInMaskParts[Part]); 2533 else 2534 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2535 } 2536 addMetadata(NewSI, SI); 2537 } 2538 return; 2539 } 2540 2541 // Handle loads. 2542 assert(LI && "Must have a load instruction"); 2543 setDebugLocFromInst(Builder, LI); 2544 for (unsigned Part = 0; Part < UF; ++Part) { 2545 Value *NewLI; 2546 if (CreateGatherScatter) { 2547 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2548 Value *VectorGep = State.get(Addr, Part); 2549 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2550 nullptr, "wide.masked.gather"); 2551 addMetadata(NewLI, LI); 2552 } else { 2553 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2554 if (isMaskRequired) 2555 NewLI = Builder.CreateMaskedLoad( 2556 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2557 "wide.masked.load"); 2558 else 2559 NewLI = 2560 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2561 2562 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2563 addMetadata(NewLI, LI); 2564 if (Reverse) 2565 NewLI = reverseVector(NewLI); 2566 } 2567 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2568 } 2569 } 2570 2571 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2572 const VPIteration &Instance, 2573 bool IfPredicateInstr, 2574 VPTransformState &State) { 2575 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2576 2577 setDebugLocFromInst(Builder, Instr); 2578 2579 // Does this instruction return a value ? 2580 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2581 2582 Instruction *Cloned = Instr->clone(); 2583 if (!IsVoidRetTy) 2584 Cloned->setName(Instr->getName() + ".cloned"); 2585 2586 // Replace the operands of the cloned instructions with their scalar 2587 // equivalents in the new loop. 2588 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2589 auto *NewOp = State.get(User.getOperand(op), Instance); 2590 Cloned->setOperand(op, NewOp); 2591 } 2592 addNewMetadata(Cloned, Instr); 2593 2594 // Place the cloned scalar in the new loop. 2595 Builder.Insert(Cloned); 2596 2597 // Add the cloned scalar to the scalar map entry. 2598 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2599 2600 // If we just cloned a new assumption, add it the assumption cache. 2601 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2602 if (II->getIntrinsicID() == Intrinsic::assume) 2603 AC->registerAssumption(II); 2604 2605 // End if-block. 
2606 if (IfPredicateInstr) 2607 PredicatedInstructions.push_back(Cloned); 2608 } 2609 2610 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2611 Value *End, Value *Step, 2612 Instruction *DL) { 2613 BasicBlock *Header = L->getHeader(); 2614 BasicBlock *Latch = L->getLoopLatch(); 2615 // As we're just creating this loop, it's possible no latch exists 2616 // yet. If so, use the header as this will be a single block loop. 2617 if (!Latch) 2618 Latch = Header; 2619 2620 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2621 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2622 setDebugLocFromInst(Builder, OldInst); 2623 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2624 2625 Builder.SetInsertPoint(Latch->getTerminator()); 2626 setDebugLocFromInst(Builder, OldInst); 2627 2628 // Create i+1 and fill the PHINode. 2629 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2630 Induction->addIncoming(Start, L->getLoopPreheader()); 2631 Induction->addIncoming(Next, Latch); 2632 // Create the compare. 2633 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2634 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2635 2636 // Now we have two terminators. Remove the old one from the block. 2637 Latch->getTerminator()->eraseFromParent(); 2638 2639 return Induction; 2640 } 2641 2642 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2643 if (TripCount) 2644 return TripCount; 2645 2646 assert(L && "Create Trip Count for null loop."); 2647 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2648 // Find the loop boundaries. 2649 ScalarEvolution *SE = PSE.getSE(); 2650 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2651 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2652 "Invalid loop count"); 2653 2654 Type *IdxTy = Legal->getWidestInductionType(); 2655 assert(IdxTy && "No type for induction"); 2656 2657 // The exit count might have the type of i64 while the phi is i32. This can 2658 // happen if we have an induction variable that is sign extended before the 2659 // compare. The only way that we get a backedge taken count is that the 2660 // induction variable was signed and as such will not overflow. In such a case 2661 // truncation is legal. 2662 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2663 IdxTy->getPrimitiveSizeInBits()) 2664 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2665 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2666 2667 // Get the total trip count from the count by adding 1. 2668 const SCEV *ExitCount = SE->getAddExpr( 2669 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2670 2671 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2672 2673 // Expand the trip count and place the new instructions in the preheader. 2674 // Notice that the pre-header does not change, only the loop body. 2675 SCEVExpander Exp(*SE, DL, "induction"); 2676 2677 // Count holds the overall loop count (N). 
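  // Worked example (editorial, not part of the original source): for a loop
  // "for (i = 0; i < n; ++i)" with n > 0, the backedge-taken count is n - 1,
  // so the trip count expanded below is (n - 1) + 1 == n.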
2678 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2679 L->getLoopPreheader()->getTerminator()); 2680 2681 if (TripCount->getType()->isPointerTy()) 2682 TripCount = 2683 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2684 L->getLoopPreheader()->getTerminator()); 2685 2686 return TripCount; 2687 } 2688 2689 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2690 if (VectorTripCount) 2691 return VectorTripCount; 2692 2693 Value *TC = getOrCreateTripCount(L); 2694 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2695 2696 Type *Ty = TC->getType(); 2697 Constant *Step = ConstantInt::get(Ty, VF * UF); 2698 2699 // If the tail is to be folded by masking, round the number of iterations N 2700 // up to a multiple of Step instead of rounding down. This is done by first 2701 // adding Step-1 and then rounding down. Note that it's ok if this addition 2702 // overflows: the vector induction variable will eventually wrap to zero given 2703 // that it starts at zero and its Step is a power of two; the loop will then 2704 // exit, with the last early-exit vector comparison also producing all-true. 2705 if (Cost->foldTailByMasking()) { 2706 assert(isPowerOf2_32(VF * UF) && 2707 "VF*UF must be a power of 2 when folding tail by masking"); 2708 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); 2709 } 2710 2711 // Now we need to generate the expression for the part of the loop that the 2712 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2713 // iterations are not required for correctness, or N - Step, otherwise. Step 2714 // is equal to the vectorization factor (number of SIMD elements) times the 2715 // unroll factor (number of SIMD instructions). 2716 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2717 2718 // If there is a non-reversed interleaved group that may speculatively access 2719 // memory out-of-bounds, we need to ensure that there will be at least one 2720 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2721 // the trip count, we set the remainder to be equal to the step. If the step 2722 // does not evenly divide the trip count, no adjustment is necessary since 2723 // there will already be scalar iterations. Note that the minimum iterations 2724 // check ensures that N >= Step. 2725 if (VF > 1 && Cost->requiresScalarEpilogue()) { 2726 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2727 R = Builder.CreateSelect(IsZero, Step, R); 2728 } 2729 2730 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2731 2732 return VectorTripCount; 2733 } 2734 2735 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2736 const DataLayout &DL) { 2737 // Verify that V is a vector type with same number of elements as DstVTy. 2738 unsigned VF = DstVTy->getNumElements(); 2739 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2740 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2741 Type *SrcElemTy = SrcVecTy->getElementType(); 2742 Type *DstElemTy = DstVTy->getElementType(); 2743 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2744 "Vector elements must have same size"); 2745 2746 // Do a direct cast if element types are castable. 2747 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2748 return Builder.CreateBitOrPointerCast(V, DstVTy); 2749 } 2750 // V cannot be directly casted to desired vector type. 
2751 // May happen when V is a floating point vector but DstVTy is a vector of 2752 // pointers or vice-versa. Handle this using a two-step bitcast using an 2753 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2754 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2755 "Only one type should be a pointer type"); 2756 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2757 "Only one type should be a floating point type"); 2758 Type *IntTy = 2759 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2760 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2761 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2762 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2763 } 2764 2765 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2766 BasicBlock *Bypass) { 2767 Value *Count = getOrCreateTripCount(L); 2768 // Reuse existing vector loop preheader for TC checks. 2769 // Note that new preheader block is generated for vector loop. 2770 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2771 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2772 2773 // Generate code to check if the loop's trip count is less than VF * UF, or 2774 // equal to it in case a scalar epilogue is required; this implies that the 2775 // vector trip count is zero. This check also covers the case where adding one 2776 // to the backedge-taken count overflowed leading to an incorrect trip count 2777 // of zero. In this case we will also jump to the scalar loop. 2778 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 2779 : ICmpInst::ICMP_ULT; 2780 2781 // If tail is to be folded, vector loop takes care of all iterations. 2782 Value *CheckMinIters = Builder.getFalse(); 2783 if (!Cost->foldTailByMasking()) 2784 CheckMinIters = Builder.CreateICmp( 2785 P, Count, ConstantInt::get(Count->getType(), VF * UF), 2786 "min.iters.check"); 2787 2788 // Create new preheader for vector loop. 2789 LoopVectorPreHeader = 2790 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2791 "vector.ph"); 2792 2793 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2794 DT->getNode(Bypass)->getIDom()) && 2795 "TC check is expected to dominate Bypass"); 2796 2797 // Update dominator for Bypass & LoopExit. 2798 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2799 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2800 2801 ReplaceInstWithInst( 2802 TCCheckBlock->getTerminator(), 2803 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2804 LoopBypassBlocks.push_back(TCCheckBlock); 2805 } 2806 2807 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2808 // Reuse existing vector loop preheader for SCEV checks. 2809 // Note that new preheader block is generated for vector loop. 2810 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 2811 2812 // Generate the code to check that the SCEV assumptions that we made. 2813 // We want the new basic block to start at the first instruction in a 2814 // sequence of instructions that form a check. 
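// Illustrative example (the exact checks depend on the predicates collected
// during analysis): if vectorization speculated that a symbolic stride used
// in an access like A[i * Stride] equals one, the expanded predicate is
// roughly
//   %stride.check = icmp ne i64 %Stride, 1
// and a true result takes the bypass edge back to the scalar loop.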
2815 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2816 "scev.check"); 2817 Value *SCEVCheck = Exp.expandCodeForPredicate( 2818 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 2819 2820 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2821 if (C->isZero()) 2822 return; 2823 2824 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 2825 OptForSizeBasedOnProfile) && 2826 "Cannot SCEV check stride or overflow when optimizing for size"); 2827 2828 SCEVCheckBlock->setName("vector.scevcheck"); 2829 // Create new preheader for vector loop. 2830 LoopVectorPreHeader = 2831 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 2832 nullptr, "vector.ph"); 2833 2834 // Update dominator only if this is first RT check. 2835 if (LoopBypassBlocks.empty()) { 2836 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2837 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2838 } 2839 2840 ReplaceInstWithInst( 2841 SCEVCheckBlock->getTerminator(), 2842 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 2843 LoopBypassBlocks.push_back(SCEVCheckBlock); 2844 AddedSafetyChecks = true; 2845 } 2846 2847 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2848 // VPlan-native path does not do any analysis for runtime checks currently. 2849 if (EnableVPlanNativePath) 2850 return; 2851 2852 // Reuse existing vector loop preheader for runtime memory checks. 2853 // Note that new preheader block is generated for vector loop. 2854 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 2855 2856 // Generate the code that checks in runtime if arrays overlap. We put the 2857 // checks into a separate block to make the more common case of few elements 2858 // faster. 2859 auto *LAI = Legal->getLAI(); 2860 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 2861 if (!RtPtrChecking.Need) 2862 return; 2863 Instruction *FirstCheckInst; 2864 Instruction *MemRuntimeCheck; 2865 std::tie(FirstCheckInst, MemRuntimeCheck) = 2866 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 2867 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 2868 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " 2869 "claimed checks are required"); 2870 2871 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 2872 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2873 "Cannot emit memory checks when optimizing for size, unless forced " 2874 "to vectorize."); 2875 ORE->emit([&]() { 2876 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2877 L->getStartLoc(), L->getHeader()) 2878 << "Code-size may be reduced by not forcing " 2879 "vectorization, or by source-code modifications " 2880 "eliminating the need for runtime checks " 2881 "(e.g., adding 'restrict')."; 2882 }); 2883 } 2884 2885 MemCheckBlock->setName("vector.memcheck"); 2886 // Create new preheader for vector loop. 2887 LoopVectorPreHeader = 2888 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2889 "vector.ph"); 2890 2891 // Update dominator only if this is first RT check. 
2892 if (LoopBypassBlocks.empty()) { 2893 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2894 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2895 } 2896 2897 ReplaceInstWithInst( 2898 MemCheckBlock->getTerminator(), 2899 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); 2900 LoopBypassBlocks.push_back(MemCheckBlock); 2901 AddedSafetyChecks = true; 2902 2903 // We currently don't use LoopVersioning for the actual loop cloning but we 2904 // still use it to add the noalias metadata. 2905 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2906 PSE.getSE()); 2907 LVer->prepareNoAliasMetadata(); 2908 } 2909 2910 Value *InnerLoopVectorizer::emitTransformedIndex( 2911 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2912 const InductionDescriptor &ID) const { 2913 2914 SCEVExpander Exp(*SE, DL, "induction"); 2915 auto Step = ID.getStep(); 2916 auto StartValue = ID.getStartValue(); 2917 assert(Index->getType() == Step->getType() && 2918 "Index type does not match StepValue type"); 2919 2920 // Note: the IR at this point is broken. We cannot use SE to create any new 2921 // SCEV and then expand it, hoping that SCEV's simplification will give us 2922 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2923 // lead to various SCEV crashes. So all we can do is to use builder and rely 2924 // on InstCombine for future simplifications. Here we handle some trivial 2925 // cases only. 2926 auto CreateAdd = [&B](Value *X, Value *Y) { 2927 assert(X->getType() == Y->getType() && "Types don't match!"); 2928 if (auto *CX = dyn_cast<ConstantInt>(X)) 2929 if (CX->isZero()) 2930 return Y; 2931 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2932 if (CY->isZero()) 2933 return X; 2934 return B.CreateAdd(X, Y); 2935 }; 2936 2937 auto CreateMul = [&B](Value *X, Value *Y) { 2938 assert(X->getType() == Y->getType() && "Types don't match!"); 2939 if (auto *CX = dyn_cast<ConstantInt>(X)) 2940 if (CX->isOne()) 2941 return Y; 2942 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2943 if (CY->isOne()) 2944 return X; 2945 return B.CreateMul(X, Y); 2946 }; 2947 2948 // Get a suitable insert point for SCEV expansion. For blocks in the vector 2949 // loop, choose the end of the vector loop header (=LoopVectorBody), because 2950 // the DomTree is not kept up-to-date for additional blocks generated in the 2951 // vector loop. By using the header as insertion point, we guarantee that the 2952 // expanded instructions dominate all their uses. 
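// In shorthand, the transformed index built by the switch below is:
//   IK_IntInduction: StartValue + Index * Step (a plain subtraction for a
//                    constant step of -1)
//   IK_PtrInduction: a GEP of StartValue by Index * Step elements
//   IK_FpInduction:  StartValue fadd/fsub (Step fmul Index)
// with trivial multiplications by one and additions of zero folded away by
// the CreateAdd/CreateMul helpers above (integer and pointer cases).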
2953 auto GetInsertPoint = [this, &B]() { 2954 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 2955 if (InsertBB != LoopVectorBody && 2956 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 2957 return LoopVectorBody->getTerminator(); 2958 return &*B.GetInsertPoint(); 2959 }; 2960 switch (ID.getKind()) { 2961 case InductionDescriptor::IK_IntInduction: { 2962 assert(Index->getType() == StartValue->getType() && 2963 "Index type does not match StartValue type"); 2964 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 2965 return B.CreateSub(StartValue, Index); 2966 auto *Offset = CreateMul( 2967 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 2968 return CreateAdd(StartValue, Offset); 2969 } 2970 case InductionDescriptor::IK_PtrInduction: { 2971 assert(isa<SCEVConstant>(Step) && 2972 "Expected constant step for pointer induction"); 2973 return B.CreateGEP( 2974 StartValue->getType()->getPointerElementType(), StartValue, 2975 CreateMul(Index, 2976 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 2977 } 2978 case InductionDescriptor::IK_FpInduction: { 2979 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2980 auto InductionBinOp = ID.getInductionBinOp(); 2981 assert(InductionBinOp && 2982 (InductionBinOp->getOpcode() == Instruction::FAdd || 2983 InductionBinOp->getOpcode() == Instruction::FSub) && 2984 "Original bin op should be defined for FP induction"); 2985 2986 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 2987 2988 // Floating point operations had to be 'fast' to enable the induction. 2989 FastMathFlags Flags; 2990 Flags.setFast(); 2991 2992 Value *MulExp = B.CreateFMul(StepValue, Index); 2993 if (isa<Instruction>(MulExp)) 2994 // We have to check, the MulExp may be a constant. 2995 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 2996 2997 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2998 "induction"); 2999 if (isa<Instruction>(BOp)) 3000 cast<Instruction>(BOp)->setFastMathFlags(Flags); 3001 3002 return BOp; 3003 } 3004 case InductionDescriptor::IK_NoInduction: 3005 return nullptr; 3006 } 3007 llvm_unreachable("invalid enum"); 3008 } 3009 3010 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3011 LoopScalarBody = OrigLoop->getHeader(); 3012 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3013 LoopExitBlock = OrigLoop->getExitBlock(); 3014 assert(LoopExitBlock && "Must have an exit block"); 3015 assert(LoopVectorPreHeader && "Invalid loop structure"); 3016 3017 LoopMiddleBlock = 3018 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3019 LI, nullptr, Twine(Prefix) + "middle.block"); 3020 LoopScalarPreHeader = 3021 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3022 nullptr, Twine(Prefix) + "scalar.ph"); 3023 // We intentionally don't let SplitBlock to update LoopInfo since 3024 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3025 // LoopVectorBody is explicitly added to the correct place few lines later. 3026 LoopVectorBody = 3027 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3028 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3029 3030 // Update dominator for loop exit. 3031 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3032 3033 // Create and register the new vector loop. 
3034 Loop *Lp = LI->AllocateLoop(); 3035 Loop *ParentLoop = OrigLoop->getParentLoop(); 3036 3037 // Insert the new loop into the loop nest and register the new basic blocks 3038 // before calling any utilities such as SCEV that require valid LoopInfo. 3039 if (ParentLoop) { 3040 ParentLoop->addChildLoop(Lp); 3041 } else { 3042 LI->addTopLevelLoop(Lp); 3043 } 3044 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3045 return Lp; 3046 } 3047 3048 void InnerLoopVectorizer::createInductionResumeValues(Loop *L, 3049 Value *VectorTripCount) { 3050 assert(VectorTripCount && L && "Expected valid arguments"); 3051 // We are going to resume the execution of the scalar loop. 3052 // Go over all of the induction variables that we found and fix the 3053 // PHIs that are left in the scalar version of the loop. 3054 // The starting values of PHI nodes depend on the counter of the last 3055 // iteration in the vectorized loop. 3056 // If we come from a bypass edge then we need to start from the original 3057 // start value. 3058 for (auto &InductionEntry : Legal->getInductionVars()) { 3059 PHINode *OrigPhi = InductionEntry.first; 3060 InductionDescriptor II = InductionEntry.second; 3061 3062 // Create phi nodes to merge from the backedge-taken check block. 3063 PHINode *BCResumeVal = 3064 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3065 LoopScalarPreHeader->getTerminator()); 3066 // Copy original phi DL over to the new one. 3067 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3068 Value *&EndValue = IVEndValues[OrigPhi]; 3069 if (OrigPhi == OldInduction) { 3070 // We know what the end value is. 3071 EndValue = VectorTripCount; 3072 } else { 3073 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3074 Type *StepType = II.getStep()->getType(); 3075 Instruction::CastOps CastOp = 3076 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3077 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3078 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3079 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3080 EndValue->setName("ind.end"); 3081 } 3082 3083 // The new PHI merges the original incoming value, in case of a bypass, 3084 // or the value at the end of the vectorized loop. 3085 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3086 3087 // Fix the scalar body counter (PHI node). 3088 // The old induction's phi node in the scalar body needs the truncated 3089 // value. 3090 for (BasicBlock *BB : LoopBypassBlocks) 3091 BCResumeVal->addIncoming(II.getStartValue(), BB); 3092 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3093 } 3094 } 3095 3096 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3097 MDNode *OrigLoopID) { 3098 assert(L && "Expected valid loop."); 3099 3100 // The trip counts should be cached by now. 3101 Value *Count = getOrCreateTripCount(L); 3102 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3103 3104 // We need the OrigLoop (scalar loop part) latch terminator to help 3105 // produce correct debug info for the middle block BB instructions. 3106 // The legality check stage guarantees that the loop will have a single 3107 // latch. 
3108 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3109 "Scalar loop latch terminator isn't a branch"); 3110 BranchInst *ScalarLatchBr = 3111 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3112 3113 // Add a check in the middle block to see if we have completed 3114 // all of the iterations in the first vector loop. 3115 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3116 // If tail is to be folded, we know we don't need to run the remainder. 3117 Value *CmpN = Builder.getTrue(); 3118 if (!Cost->foldTailByMasking()) { 3119 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3120 VectorTripCount, "cmp.n", 3121 LoopMiddleBlock->getTerminator()); 3122 3123 // Here we use the same DebugLoc as the scalar loop latch branch instead 3124 // of the corresponding compare because they may have ended up with 3125 // different line numbers and we want to avoid awkward line stepping while 3126 // debugging. Eg. if the compare has got a line number inside the loop. 3127 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3128 } 3129 3130 BranchInst *BrInst = 3131 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3132 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3133 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3134 3135 // Get ready to start creating new instructions into the vectorized body. 3136 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3137 "Inconsistent vector loop preheader"); 3138 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3139 3140 Optional<MDNode *> VectorizedLoopID = 3141 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3142 LLVMLoopVectorizeFollowupVectorized}); 3143 if (VectorizedLoopID.hasValue()) { 3144 L->setLoopID(VectorizedLoopID.getValue()); 3145 3146 // Do not setAlreadyVectorized if loop attributes have been defined 3147 // explicitly. 3148 return LoopVectorPreHeader; 3149 } 3150 3151 // Keep all loop hints from the original loop on the vector loop (we'll 3152 // replace the vectorizer-specific hints below). 3153 if (MDNode *LID = OrigLoop->getLoopID()) 3154 L->setLoopID(LID); 3155 3156 LoopVectorizeHints Hints(L, true, *ORE); 3157 Hints.setAlreadyVectorized(); 3158 3159 #ifdef EXPENSIVE_CHECKS 3160 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3161 LI->verify(*DT); 3162 #endif 3163 3164 return LoopVectorPreHeader; 3165 } 3166 3167 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3168 /* 3169 In this function we generate a new loop. The new loop will contain 3170 the vectorized instructions while the old loop will continue to run the 3171 scalar remainder. 3172 3173 [ ] <-- loop iteration number check. 3174 / | 3175 / v 3176 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3177 | / | 3178 | / v 3179 || [ ] <-- vector pre header. 3180 |/ | 3181 | v 3182 | [ ] \ 3183 | [ ]_| <-- vector loop. 3184 | | 3185 | v 3186 | -[ ] <--- middle-block. 3187 | / | 3188 | / v 3189 -|- >[ ] <--- new preheader. 3190 | | 3191 | v 3192 | [ ] \ 3193 | [ ]_| <-- old scalar loop to handle remainder. 3194 \ | 3195 \ v 3196 >[ ] <-- exit block. 3197 ... 3198 */ 3199 3200 // Get the metadata of the original loop before it gets modified. 3201 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3202 3203 // Create an empty vector loop, and prepare basic blocks for the runtime 3204 // checks. 3205 Loop *Lp = createVectorLoopSkeleton(""); 3206 3207 // Now, compare the new count to zero. 
If it is zero skip the vector loop and 3208 // jump to the scalar loop. This check also covers the case where the 3209 // backedge-taken count is uint##_max: adding one to it will overflow leading 3210 // to an incorrect trip count of zero. In this (rare) case we will also jump 3211 // to the scalar loop. 3212 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3213 3214 // Generate the code to check any assumptions that we've made for SCEV 3215 // expressions. 3216 emitSCEVChecks(Lp, LoopScalarPreHeader); 3217 3218 // Generate the code that checks in runtime if arrays overlap. We put the 3219 // checks into a separate block to make the more common case of few elements 3220 // faster. 3221 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3222 3223 // Some loops have a single integer induction variable, while other loops 3224 // don't. One example is c++ iterators that often have multiple pointer 3225 // induction variables. In the code below we also support a case where we 3226 // don't have a single induction variable. 3227 // 3228 // We try to obtain an induction variable from the original loop as hard 3229 // as possible. However if we don't find one that: 3230 // - is an integer 3231 // - counts from zero, stepping by one 3232 // - is the size of the widest induction variable type 3233 // then we create a new one. 3234 OldInduction = Legal->getPrimaryInduction(); 3235 Type *IdxTy = Legal->getWidestInductionType(); 3236 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3237 // The loop step is equal to the vectorization factor (num of SIMD elements) 3238 // times the unroll factor (num of SIMD instructions). 3239 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 3240 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3241 Induction = 3242 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3243 getDebugLocFromInstOrOperands(OldInduction)); 3244 3245 // Emit phis for the new starting index of the scalar loop. 3246 createInductionResumeValues(Lp, CountRoundDown); 3247 3248 return completeLoopSkeleton(Lp, OrigLoopID); 3249 } 3250 3251 // Fix up external users of the induction variable. At this point, we are 3252 // in LCSSA form, with all external PHIs that use the IV having one input value, 3253 // coming from the remainder loop. We need those PHIs to also have a correct 3254 // value for the IV when arriving directly from the middle block. 3255 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3256 const InductionDescriptor &II, 3257 Value *CountRoundDown, Value *EndValue, 3258 BasicBlock *MiddleBlock) { 3259 // There are two kinds of external IV usages - those that use the value 3260 // computed in the last iteration (the PHI) and those that use the penultimate 3261 // value (the value that feeds into the phi from the loop latch). 3262 // We allow both, but they, obviously, have different values. 3263 3264 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3265 3266 DenseMap<Value *, Value *> MissingVals; 3267 3268 // An external user of the last iteration's value should see the value that 3269 // the remainder loop uses to initialize its own IV. 3270 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3271 for (User *U : PostInc->users()) { 3272 Instruction *UI = cast<Instruction>(U); 3273 if (!OrigLoop->contains(UI)) { 3274 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3275 MissingVals[UI] = EndValue; 3276 } 3277 } 3278 3279 // An external user of the penultimate value need to see EndValue - Step. 
3280 // The simplest way to get this is to recompute it from the constituent SCEVs, 3281 // that is Start + (Step * (CRD - 1)). 3282 for (User *U : OrigPhi->users()) { 3283 auto *UI = cast<Instruction>(U); 3284 if (!OrigLoop->contains(UI)) { 3285 const DataLayout &DL = 3286 OrigLoop->getHeader()->getModule()->getDataLayout(); 3287 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3288 3289 IRBuilder<> B(MiddleBlock->getTerminator()); 3290 Value *CountMinusOne = B.CreateSub( 3291 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3292 Value *CMO = 3293 !II.getStep()->getType()->isIntegerTy() 3294 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3295 II.getStep()->getType()) 3296 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3297 CMO->setName("cast.cmo"); 3298 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3299 Escape->setName("ind.escape"); 3300 MissingVals[UI] = Escape; 3301 } 3302 } 3303 3304 for (auto &I : MissingVals) { 3305 PHINode *PHI = cast<PHINode>(I.first); 3306 // One corner case we have to handle is two IVs "chasing" each-other, 3307 // that is %IV2 = phi [...], [ %IV1, %latch ] 3308 // In this case, if IV1 has an external use, we need to avoid adding both 3309 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3310 // don't already have an incoming value for the middle block. 3311 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3312 PHI->addIncoming(I.second, MiddleBlock); 3313 } 3314 } 3315 3316 namespace { 3317 3318 struct CSEDenseMapInfo { 3319 static bool canHandle(const Instruction *I) { 3320 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3321 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3322 } 3323 3324 static inline Instruction *getEmptyKey() { 3325 return DenseMapInfo<Instruction *>::getEmptyKey(); 3326 } 3327 3328 static inline Instruction *getTombstoneKey() { 3329 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3330 } 3331 3332 static unsigned getHashValue(const Instruction *I) { 3333 assert(canHandle(I) && "Unknown instruction!"); 3334 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3335 I->value_op_end())); 3336 } 3337 3338 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3339 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3340 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3341 return LHS == RHS; 3342 return LHS->isIdenticalTo(RHS); 3343 } 3344 }; 3345 3346 } // end anonymous namespace 3347 3348 ///Perform cse of induction variable instructions. 3349 static void cse(BasicBlock *BB) { 3350 // Perform simple cse. 3351 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3352 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3353 Instruction *In = &*I++; 3354 3355 if (!CSEDenseMapInfo::canHandle(In)) 3356 continue; 3357 3358 // Check if we can replace this instruction with any of the 3359 // visited instructions. 
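// Illustrative example: unrolling by UF > 1 can leave several identical
// getelementptr or extractelement instructions in the vector body; the lookup
// below maps each later copy onto the first occurrence so it can be replaced
// and erased.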
3360 if (Instruction *V = CSEMap.lookup(In)) { 3361 In->replaceAllUsesWith(V); 3362 In->eraseFromParent(); 3363 continue; 3364 } 3365 3366 CSEMap[In] = In; 3367 } 3368 } 3369 3370 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3371 unsigned VF, 3372 bool &NeedToScalarize) { 3373 Function *F = CI->getCalledFunction(); 3374 Type *ScalarRetTy = CI->getType(); 3375 SmallVector<Type *, 4> Tys, ScalarTys; 3376 for (auto &ArgOp : CI->arg_operands()) 3377 ScalarTys.push_back(ArgOp->getType()); 3378 3379 // Estimate cost of scalarized vector call. The source operands are assumed 3380 // to be vectors, so we need to extract individual elements from there, 3381 // execute VF scalar calls, and then gather the result into the vector return 3382 // value. 3383 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, 3384 TTI::TCK_RecipThroughput); 3385 if (VF == 1) 3386 return ScalarCallCost; 3387 3388 // Compute corresponding vector type for return value and arguments. 3389 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3390 for (Type *ScalarTy : ScalarTys) 3391 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3392 3393 // Compute costs of unpacking argument values for the scalar calls and 3394 // packing the return values to a vector. 3395 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3396 3397 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3398 3399 // If we can't emit a vector call for this function, then the currently found 3400 // cost is the cost we need to return. 3401 NeedToScalarize = true; 3402 VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/); 3403 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3404 3405 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3406 return Cost; 3407 3408 // If the corresponding vector cost is cheaper, return its cost. 3409 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, 3410 TTI::TCK_RecipThroughput); 3411 if (VectorCallCost < Cost) { 3412 NeedToScalarize = false; 3413 return VectorCallCost; 3414 } 3415 return Cost; 3416 } 3417 3418 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3419 unsigned VF) { 3420 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3421 assert(ID && "Expected intrinsic call!"); 3422 3423 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3424 return TTI.getIntrinsicInstrCost(CostAttrs, 3425 TargetTransformInfo::TCK_RecipThroughput); 3426 } 3427 3428 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3429 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3430 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3431 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3432 } 3433 3434 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3435 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3436 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3437 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3438 } 3439 3440 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3441 // For every instruction `I` in MinBWs, truncate the operands, create a 3442 // truncated version of `I` and reextend its result. InstCombine runs 3443 // later and will remove any ext/trunc pairs. 3444 SmallPtrSet<Value *, 4> Erased; 3445 for (const auto &KV : Cost->getMinimalBitwidths()) { 3446 // If the value wasn't vectorized, we must maintain the original scalar 3447 // type. 
The absence of the value from VectorLoopValueMap indicates that it 3448 // wasn't vectorized. 3449 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3450 continue; 3451 for (unsigned Part = 0; Part < UF; ++Part) { 3452 Value *I = getOrCreateVectorValue(KV.first, Part); 3453 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3454 continue; 3455 Type *OriginalTy = I->getType(); 3456 Type *ScalarTruncatedTy = 3457 IntegerType::get(OriginalTy->getContext(), KV.second); 3458 auto *TruncatedTy = FixedVectorType::get( 3459 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements()); 3460 if (TruncatedTy == OriginalTy) 3461 continue; 3462 3463 IRBuilder<> B(cast<Instruction>(I)); 3464 auto ShrinkOperand = [&](Value *V) -> Value * { 3465 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3466 if (ZI->getSrcTy() == TruncatedTy) 3467 return ZI->getOperand(0); 3468 return B.CreateZExtOrTrunc(V, TruncatedTy); 3469 }; 3470 3471 // The actual instruction modification depends on the instruction type, 3472 // unfortunately. 3473 Value *NewI = nullptr; 3474 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3475 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3476 ShrinkOperand(BO->getOperand(1))); 3477 3478 // Any wrapping introduced by shrinking this operation shouldn't be 3479 // considered undefined behavior. So, we can't unconditionally copy 3480 // arithmetic wrapping flags to NewI. 3481 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3482 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3483 NewI = 3484 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3485 ShrinkOperand(CI->getOperand(1))); 3486 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3487 NewI = B.CreateSelect(SI->getCondition(), 3488 ShrinkOperand(SI->getTrueValue()), 3489 ShrinkOperand(SI->getFalseValue())); 3490 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3491 switch (CI->getOpcode()) { 3492 default: 3493 llvm_unreachable("Unhandled cast!"); 3494 case Instruction::Trunc: 3495 NewI = ShrinkOperand(CI->getOperand(0)); 3496 break; 3497 case Instruction::SExt: 3498 NewI = B.CreateSExtOrTrunc( 3499 CI->getOperand(0), 3500 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3501 break; 3502 case Instruction::ZExt: 3503 NewI = B.CreateZExtOrTrunc( 3504 CI->getOperand(0), 3505 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3506 break; 3507 } 3508 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3509 auto Elements0 = 3510 cast<VectorType>(SI->getOperand(0)->getType())->getNumElements(); 3511 auto *O0 = B.CreateZExtOrTrunc( 3512 SI->getOperand(0), 3513 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3514 auto Elements1 = 3515 cast<VectorType>(SI->getOperand(1)->getType())->getNumElements(); 3516 auto *O1 = B.CreateZExtOrTrunc( 3517 SI->getOperand(1), 3518 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3519 3520 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3521 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3522 // Don't do anything with the operands, just extend the result. 
3523 continue; 3524 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3525 auto Elements = 3526 cast<VectorType>(IE->getOperand(0)->getType())->getNumElements(); 3527 auto *O0 = B.CreateZExtOrTrunc( 3528 IE->getOperand(0), 3529 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3530 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3531 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3532 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3533 auto Elements = 3534 cast<VectorType>(EE->getOperand(0)->getType())->getNumElements(); 3535 auto *O0 = B.CreateZExtOrTrunc( 3536 EE->getOperand(0), 3537 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3538 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3539 } else { 3540 // If we don't know what to do, be conservative and don't do anything. 3541 continue; 3542 } 3543 3544 // Lastly, extend the result. 3545 NewI->takeName(cast<Instruction>(I)); 3546 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3547 I->replaceAllUsesWith(Res); 3548 cast<Instruction>(I)->eraseFromParent(); 3549 Erased.insert(I); 3550 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3551 } 3552 } 3553 3554 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3555 for (const auto &KV : Cost->getMinimalBitwidths()) { 3556 // If the value wasn't vectorized, we must maintain the original scalar 3557 // type. The absence of the value from VectorLoopValueMap indicates that it 3558 // wasn't vectorized. 3559 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3560 continue; 3561 for (unsigned Part = 0; Part < UF; ++Part) { 3562 Value *I = getOrCreateVectorValue(KV.first, Part); 3563 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3564 if (Inst && Inst->use_empty()) { 3565 Value *NewI = Inst->getOperand(0); 3566 Inst->eraseFromParent(); 3567 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3568 } 3569 } 3570 } 3571 } 3572 3573 void InnerLoopVectorizer::fixVectorizedLoop() { 3574 // Insert truncates and extends for any truncated instructions as hints to 3575 // InstCombine. 3576 if (VF > 1) 3577 truncateToMinimalBitwidths(); 3578 3579 // Fix widened non-induction PHIs by setting up the PHI operands. 3580 if (OrigPHIsToFix.size()) { 3581 assert(EnableVPlanNativePath && 3582 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3583 fixNonInductionPHIs(); 3584 } 3585 3586 // At this point every instruction in the original loop is widened to a 3587 // vector form. Now we need to fix the recurrences in the loop. These PHI 3588 // nodes are currently empty because we did not want to introduce cycles. 3589 // This is the second stage of vectorizing recurrences. 3590 fixCrossIterationPHIs(); 3591 3592 // Forget the original basic block. 3593 PSE.getSE()->forgetLoop(OrigLoop); 3594 3595 // Fix-up external users of the induction variables. 3596 for (auto &Entry : Legal->getInductionVars()) 3597 fixupIVUsers(Entry.first, Entry.second, 3598 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3599 IVEndValues[Entry.first], LoopMiddleBlock); 3600 3601 fixLCSSAPHIs(); 3602 for (Instruction *PI : PredicatedInstructions) 3603 sinkScalarOperands(&*PI); 3604 3605 // Remove redundant induction instructions. 3606 cse(LoopVectorBody); 3607 3608 // Set/update profile weights for the vector and remainder loops as original 3609 // loop iterations are now distributed among them. Note that original loop 3610 // represented by LoopScalarBody becomes remainder loop after vectorization. 
3611 //
3612 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3613 // end up with a slightly less accurate result, but that should be OK since
3614 // the profile is not inherently precise anyway. Note also that a possible
3615 // bypass of vector code caused by legality checks is ignored, assigning all
3616 // the weight to the vector loop, optimistically.
3617 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3618 LI->getLoopFor(LoopVectorBody),
3619 LI->getLoopFor(LoopScalarBody), VF * UF);
3620 }
3621
3622 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3623 // In order to support recurrences we need to be able to vectorize Phi nodes.
3624 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3625 // stage #2: We now need to fix the recurrences by adding incoming edges to
3626 // the currently empty PHI nodes. At this point every instruction in the
3627 // original loop is widened to a vector form so we can use them to construct
3628 // the incoming edges.
3629 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3630 // Handle first-order recurrences and reductions that need to be fixed.
3631 if (Legal->isFirstOrderRecurrence(&Phi))
3632 fixFirstOrderRecurrence(&Phi);
3633 else if (Legal->isReductionVariable(&Phi))
3634 fixReduction(&Phi);
3635 }
3636 }
3637
3638 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3639 // This is the second phase of vectorizing first-order recurrences. An
3640 // overview of the transformation is described below. Suppose we have the
3641 // following loop.
3642 //
3643 // for (int i = 0; i < n; ++i)
3644 // b[i] = a[i] - a[i - 1];
3645 //
3646 // There is a first-order recurrence on "a". For this loop, the shorthand
3647 // scalar IR looks like:
3648 //
3649 // scalar.ph:
3650 // s_init = a[-1]
3651 // br scalar.body
3652 //
3653 // scalar.body:
3654 // i = phi [0, scalar.ph], [i+1, scalar.body]
3655 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3656 // s2 = a[i]
3657 // b[i] = s2 - s1
3658 // br cond, scalar.body, ...
3659 //
3660 // In this example, s1 is a recurrence because its value depends on the
3661 // previous iteration. In the first phase of vectorization, we created a
3662 // temporary value for s1. We now complete the vectorization and produce the
3663 // shorthand vector IR shown below (for VF = 4, UF = 1).
3664 //
3665 // vector.ph:
3666 // v_init = vector(..., ..., ..., a[-1])
3667 // br vector.body
3668 //
3669 // vector.body:
3670 // i = phi [0, vector.ph], [i+4, vector.body]
3671 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3672 // v2 = a[i, i+1, i+2, i+3];
3673 // v3 = vector(v1(3), v2(0, 1, 2))
3674 // b[i, i+1, i+2, i+3] = v2 - v3
3675 // br cond, vector.body, middle.block
3676 //
3677 // middle.block:
3678 // x = v2(3)
3679 // br scalar.ph
3680 //
3681 // scalar.ph:
3682 // s_init = phi [x, middle.block], [a[-1], otherwise]
3683 // br scalar.body
3684 //
3685 // After the vector loop completes execution, we extract the next value of
3686 // the recurrence (x) to use as the initial value in the scalar loop.
3687
3688 // Get the original loop preheader and single loop latch.
3689 auto *Preheader = OrigLoop->getLoopPreheader();
3690 auto *Latch = OrigLoop->getLoopLatch();
3691
3692 // Get the initial and previous values of the scalar recurrence.
3693 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3694 auto *Previous = Phi->getIncomingValueForBlock(Latch);
3695
3696 // Create a vector from the initial value.
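// Illustrative shorthand (VF = 4): only the last lane of the initial vector is
// ever read by the recurrence shuffle created below (mask <3, 4, 5, 6>), so
// v_init is built as <undef, undef, undef, s_init>, matching the
// v_init = vector(..., ..., ..., a[-1]) form in the overview above.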
3697 auto *VectorInit = ScalarInit; 3698 if (VF > 1) { 3699 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3700 VectorInit = Builder.CreateInsertElement( 3701 UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF)), 3702 VectorInit, Builder.getInt32(VF - 1), "vector.recur.init"); 3703 } 3704 3705 // We constructed a temporary phi node in the first phase of vectorization. 3706 // This phi node will eventually be deleted. 3707 Builder.SetInsertPoint( 3708 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3709 3710 // Create a phi node for the new recurrence. The current value will either be 3711 // the initial value inserted into a vector or loop-varying vector value. 3712 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3713 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3714 3715 // Get the vectorized previous value of the last part UF - 1. It appears last 3716 // among all unrolled iterations, due to the order of their construction. 3717 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3718 3719 // Find and set the insertion point after the previous value if it is an 3720 // instruction. 3721 BasicBlock::iterator InsertPt; 3722 // Note that the previous value may have been constant-folded so it is not 3723 // guaranteed to be an instruction in the vector loop. 3724 // FIXME: Loop invariant values do not form recurrences. We should deal with 3725 // them earlier. 3726 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3727 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3728 else { 3729 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3730 if (isa<PHINode>(PreviousLastPart)) 3731 // If the previous value is a phi node, we should insert after all the phi 3732 // nodes in the block containing the PHI to avoid breaking basic block 3733 // verification. Note that the basic block may be different to 3734 // LoopVectorBody, in case we predicate the loop. 3735 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3736 else 3737 InsertPt = ++PreviousInst->getIterator(); 3738 } 3739 Builder.SetInsertPoint(&*InsertPt); 3740 3741 // We will construct a vector for the recurrence by combining the values for 3742 // the current and previous iterations. This is the required shuffle mask. 3743 SmallVector<int, 8> ShuffleMask(VF); 3744 ShuffleMask[0] = VF - 1; 3745 for (unsigned I = 1; I < VF; ++I) 3746 ShuffleMask[I] = I + VF - 1; 3747 3748 // The vector from which to take the initial value for the current iteration 3749 // (actual or unrolled). Initially, this is the vector phi node. 3750 Value *Incoming = VecPhi; 3751 3752 // Shuffle the current and previous vector and update the vector parts. 3753 for (unsigned Part = 0; Part < UF; ++Part) { 3754 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3755 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3756 auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, 3757 ShuffleMask) 3758 : Incoming; 3759 PhiPart->replaceAllUsesWith(Shuffle); 3760 cast<Instruction>(PhiPart)->eraseFromParent(); 3761 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3762 Incoming = PreviousPart; 3763 } 3764 3765 // Fix the latch value of the new recurrence in the vector loop. 3766 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3767 3768 // Extract the last vector element in the middle block. 
This will be the
3769 // initial value for the recurrence when jumping to the scalar loop.
3770 auto *ExtractForScalar = Incoming;
3771 if (VF > 1) {
3772 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3773 ExtractForScalar = Builder.CreateExtractElement(
3774 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3775 }
3776 // Extract the second-to-last element in the middle block if the
3777 // Phi is used outside the loop. We need to extract the phi itself
3778 // and not the last element (the phi update in the current iteration). This
3779 // will be the value when jumping to the exit block from the LoopMiddleBlock,
3780 // when the scalar loop is not run at all.
3781 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3782 if (VF > 1)
3783 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3784 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3785 // When the loop is unrolled without vectorizing, initialize
3786 // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
3787 // `Incoming`. This is analogous to the vectorized case above: extracting the
3788 // second-to-last element when VF > 1.
3789 else if (UF > 1)
3790 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3791
3792 // Fix the initial value of the original recurrence in the scalar loop.
3793 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3794 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3795 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3796 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3797 Start->addIncoming(Incoming, BB);
3798 }
3799
3800 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3801 Phi->setName("scalar.recur");
3802
3803 // Finally, fix users of the recurrence outside the loop. The users will need
3804 // either the last value of the scalar recurrence or the last value of the
3805 // vector recurrence we extracted in the middle block. Since the loop is in
3806 // LCSSA form, we just need to find all the phi nodes for the original scalar
3807 // recurrence in the exit block, and then add an edge for the middle block.
3808 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3809 if (LCSSAPhi.getIncomingValue(0) == Phi) {
3810 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3811 }
3812 }
3813 }
3814
3815 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3816 Constant *Zero = Builder.getInt32(0);
3817
3818 // Get its reduction variable descriptor.
3819 assert(Legal->isReductionVariable(Phi) &&
3820 "Unable to find the reduction variable");
3821 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3822
3823 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3824 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3825 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3826 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3827 RdxDesc.getMinMaxRecurrenceKind();
3828 setDebugLocFromInst(Builder, ReductionStartValue);
3829 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
3830
3831 // We need to generate a reduction vector from the incoming scalar.
3832 // To do so, we need to generate the 'identity' vector and override
3833 // one of the elements with the incoming scalar reduction. We need
3834 // to do it in the vector-loop preheader.
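// Illustrative shorthand (VF = 4, UF = 1): for an integer add reduction with
// start value %s, Identity below is <0, 0, 0, 0> and VectorStart is
// <%s, 0, 0, 0>; for a min/max reduction the start value itself is used as
// the identity and is splatted across the vector instead.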
3835 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3836
3837 // This is the vector-clone of the value that leaves the loop.
3838 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3839
3840 // Find the reduction identity value: zero for addition, or and xor; one for
3841 // multiplication; -1 (all ones) for and.
3842 Value *Identity;
3843 Value *VectorStart;
3844 if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3845 RK == RecurrenceDescriptor::RK_FloatMinMax) {
3846 // MinMax reductions have the start value as their identity.
3847 if (VF == 1 || IsInLoopReductionPhi) {
3848 VectorStart = Identity = ReductionStartValue;
3849 } else {
3850 VectorStart = Identity =
3851 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3852 }
3853 } else {
3854 // Handle other reduction kinds:
3855 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3856 RK, VecTy->getScalarType());
3857 if (VF == 1 || IsInLoopReductionPhi) {
3858 Identity = Iden;
3859 // This vector is the Identity vector where the first element is the
3860 // incoming scalar reduction.
3861 VectorStart = ReductionStartValue;
3862 } else {
3863 Identity = ConstantVector::getSplat({VF, false}, Iden);
3864
3865 // This vector is the Identity vector where the first element is the
3866 // incoming scalar reduction.
3867 VectorStart =
3868 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3869 }
3870 }
3871
3872 // Wrap flags are in general invalid after vectorization, clear them.
3873 clearReductionWrapFlags(RdxDesc);
3874
3875 // Fix the vector-loop phi.
3876
3877 // Reductions do not have to start at zero. They can start with
3878 // any loop-invariant value.
3879 BasicBlock *Latch = OrigLoop->getLoopLatch();
3880 Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3881
3882 for (unsigned Part = 0; Part < UF; ++Part) {
3883 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3884 Value *Val = getOrCreateVectorValue(LoopVal, Part);
3885 // Make sure to add the reduction start value only to the
3886 // first unroll part.
3887 Value *StartVal = (Part == 0) ? VectorStart : Identity;
3888 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3889 cast<PHINode>(VecRdxPhi)
3890 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3891 }
3892
3893 // Before each round, move the insertion point right between
3894 // the PHIs and the values we are going to write.
3895 // This allows us to write both PHINodes and the extractelement
3896 // instructions.
3897 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3898
3899 setDebugLocFromInst(Builder, LoopExitInst);
3900
3901 // If the tail is folded by masking, the vector value that leaves the loop
3902 // should be a Select choosing between the vectorized LoopExitInst and the
3903 // vectorized Phi, instead of the former.
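// In shorthand (illustrative types, VF = 4): instead of the raw vectorized
// exit value
//   %rdx.next = add <4 x i32> %rdx.phi, %val
// the value recorded for the exit instruction becomes the mask-driven
//   %rdx.sel = select <4 x i1> %mask, <4 x i32> %rdx.next, <4 x i32> %rdx.phi
// so lanes beyond the trip count keep the previous partial sums. The select
// itself was already created when the loop body was widened; the loop below
// only redirects the value map to it.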
3904 if (Cost->foldTailByMasking()) { 3905 for (unsigned Part = 0; Part < UF; ++Part) { 3906 Value *VecLoopExitInst = 3907 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3908 Value *Sel = nullptr; 3909 for (User *U : VecLoopExitInst->users()) { 3910 if (isa<SelectInst>(U)) { 3911 assert(!Sel && "Reduction exit feeding two selects"); 3912 Sel = U; 3913 } else 3914 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3915 } 3916 assert(Sel && "Reduction exit feeds no select"); 3917 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 3918 } 3919 } 3920 3921 // If the vector reduction can be performed in a smaller type, we truncate 3922 // then extend the loop exit value to enable InstCombine to evaluate the 3923 // entire expression in the smaller type. 3924 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { 3925 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 3926 Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF); 3927 Builder.SetInsertPoint( 3928 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 3929 VectorParts RdxParts(UF); 3930 for (unsigned Part = 0; Part < UF; ++Part) { 3931 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3932 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3933 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3934 : Builder.CreateZExt(Trunc, VecTy); 3935 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 3936 UI != RdxParts[Part]->user_end();) 3937 if (*UI != Trunc) { 3938 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 3939 RdxParts[Part] = Extnd; 3940 } else { 3941 ++UI; 3942 } 3943 } 3944 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3945 for (unsigned Part = 0; Part < UF; ++Part) { 3946 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3947 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 3948 } 3949 } 3950 3951 // Reduce all of the unrolled parts into a single vector. 3952 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 3953 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3954 3955 // The middle block terminator has already been assigned a DebugLoc here (the 3956 // OrigLoop's single latch terminator). We want the whole middle block to 3957 // appear to execute on this line because: (a) it is all compiler generated, 3958 // (b) these instructions are always executed after evaluating the latch 3959 // conditional branch, and (c) other passes may add new predecessors which 3960 // terminate on this line. This is the easiest way to ensure we don't 3961 // accidentally cause an extra step back into the loop while debugging. 3962 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 3963 for (unsigned Part = 1; Part < UF; ++Part) { 3964 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3965 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3966 // Floating point operations had to be 'fast' to enable the reduction. 3967 ReducedPartRdx = addFastMathFlag( 3968 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 3969 ReducedPartRdx, "bin.rdx"), 3970 RdxDesc.getFastMathFlags()); 3971 else 3972 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 3973 RdxPart); 3974 } 3975 3976 // Create the reduction after the loop. Note that inloop reductions create the 3977 // target reduction in the loop using a Reduction recipe. 
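// Illustrative example: for an integer add reduction with VF = 4, the call
// below collapses the <4 x i32> partial sum into a single scalar, e.g. via a
// horizontal vector-reduce add intrinsic or an equivalent shuffle tree; the
// exact form is chosen by createTargetReduction together with TTI.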
3978 if (VF > 1 && !IsInLoopReductionPhi) { 3979 bool NoNaN = Legal->hasFunNoNaNAttr(); 3980 ReducedPartRdx = 3981 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 3982 // If the reduction can be performed in a smaller type, we need to extend 3983 // the reduction to the wider type before we branch to the original loop. 3984 if (Phi->getType() != RdxDesc.getRecurrenceType()) 3985 ReducedPartRdx = 3986 RdxDesc.isSigned() 3987 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 3988 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 3989 } 3990 3991 // Create a phi node that merges control-flow from the backedge-taken check 3992 // block and the middle block. 3993 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 3994 LoopScalarPreHeader->getTerminator()); 3995 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3996 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3997 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3998 3999 // Now, we need to fix the users of the reduction variable 4000 // inside and outside of the scalar remainder loop. 4001 // We know that the loop is in LCSSA form. We need to update the 4002 // PHI nodes in the exit blocks. 4003 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4004 // All PHINodes need to have a single entry edge, or two if 4005 // we already fixed them. 4006 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 4007 4008 // We found a reduction value exit-PHI. Update it with the 4009 // incoming bypass edge. 4010 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 4011 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4012 } // end of the LCSSA phi scan. 4013 4014 // Fix the scalar loop reduction variable with the incoming reduction sum 4015 // from the vector body and from the backedge value. 4016 int IncomingEdgeBlockIdx = 4017 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4018 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4019 // Pick the other block. 4020 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 4021 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4022 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4023 } 4024 4025 void InnerLoopVectorizer::clearReductionWrapFlags( 4026 RecurrenceDescriptor &RdxDesc) { 4027 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 4028 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 4029 RK != RecurrenceDescriptor::RK_IntegerMult) 4030 return; 4031 4032 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4033 assert(LoopExitInstr && "null loop exit instruction"); 4034 SmallVector<Instruction *, 8> Worklist; 4035 SmallPtrSet<Instruction *, 8> Visited; 4036 Worklist.push_back(LoopExitInstr); 4037 Visited.insert(LoopExitInstr); 4038 4039 while (!Worklist.empty()) { 4040 Instruction *Cur = Worklist.pop_back_val(); 4041 if (isa<OverflowingBinaryOperator>(Cur)) 4042 for (unsigned Part = 0; Part < UF; ++Part) { 4043 Value *V = getOrCreateVectorValue(Cur, Part); 4044 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4045 } 4046 4047 for (User *U : Cur->users()) { 4048 Instruction *UI = cast<Instruction>(U); 4049 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4050 Visited.insert(UI).second) 4051 Worklist.push_back(UI); 4052 } 4053 } 4054 } 4055 4056 void InnerLoopVectorizer::fixLCSSAPHIs() { 4057 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4058 if (LCSSAPhi.getNumIncomingValues() == 1) { 4059 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4060 // Non-instruction incoming values will have only one value. 4061 unsigned LastLane = 0; 4062 if (isa<Instruction>(IncomingValue)) 4063 LastLane = Cost->isUniformAfterVectorization( 4064 cast<Instruction>(IncomingValue), VF) 4065 ? 0 4066 : VF - 1; 4067 // Can be a loop invariant incoming value or the last scalar value to be 4068 // extracted from the vectorized loop. 4069 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4070 Value *lastIncomingValue = 4071 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 4072 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4073 } 4074 } 4075 } 4076 4077 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4078 // The basic block and loop containing the predicated instruction. 4079 auto *PredBB = PredInst->getParent(); 4080 auto *VectorLoop = LI->getLoopFor(PredBB); 4081 4082 // Initialize a worklist with the operands of the predicated instruction. 4083 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4084 4085 // Holds instructions that we need to analyze again. An instruction may be 4086 // reanalyzed if we don't yet know if we can sink it or not. 4087 SmallVector<Instruction *, 8> InstsToReanalyze; 4088 4089 // Returns true if a given use occurs in the predicated block. Phi nodes use 4090 // their operands in their corresponding predecessor blocks. 4091 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4092 auto *I = cast<Instruction>(U.getUser()); 4093 BasicBlock *BB = I->getParent(); 4094 if (auto *Phi = dyn_cast<PHINode>(I)) 4095 BB = Phi->getIncomingBlock( 4096 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4097 return BB == PredBB; 4098 }; 4099 4100 // Iteratively sink the scalarized operands of the predicated instruction 4101 // into the block we created for it. When an instruction is sunk, it's 4102 // operands are then added to the worklist. The algorithm ends after one pass 4103 // through the worklist doesn't sink a single instruction. 
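  // Illustrative example (assumed scenario): if PredInst is a scalarized
  // udiv placed in its own predicated block, and its operand is an add that
  // lives in the vector loop body but is used only by that udiv, the add is
  // moved into the predicated block on the first pass; the add's own
  // operands are then reconsidered on the next pass, and so on until a pass
  // sinks nothing.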
4104 bool Changed; 4105 do { 4106 // Add the instructions that need to be reanalyzed to the worklist, and 4107 // reset the changed indicator. 4108 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4109 InstsToReanalyze.clear(); 4110 Changed = false; 4111 4112 while (!Worklist.empty()) { 4113 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4114 4115 // We can't sink an instruction if it is a phi node, is already in the 4116 // predicated block, is not in the loop, or may have side effects. 4117 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4118 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4119 continue; 4120 4121 // It's legal to sink the instruction if all its uses occur in the 4122 // predicated block. Otherwise, there's nothing to do yet, and we may 4123 // need to reanalyze the instruction. 4124 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4125 InstsToReanalyze.push_back(I); 4126 continue; 4127 } 4128 4129 // Move the instruction to the beginning of the predicated block, and add 4130 // it's operands to the worklist. 4131 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4132 Worklist.insert(I->op_begin(), I->op_end()); 4133 4134 // The sinking may have enabled other instructions to be sunk, so we will 4135 // need to iterate. 4136 Changed = true; 4137 } 4138 } while (Changed); 4139 } 4140 4141 void InnerLoopVectorizer::fixNonInductionPHIs() { 4142 for (PHINode *OrigPhi : OrigPHIsToFix) { 4143 PHINode *NewPhi = 4144 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4145 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4146 4147 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4148 predecessors(OrigPhi->getParent())); 4149 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4150 predecessors(NewPhi->getParent())); 4151 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4152 "Scalar and Vector BB should have the same number of predecessors"); 4153 4154 // The insertion point in Builder may be invalidated by the time we get 4155 // here. Force the Builder insertion point to something valid so that we do 4156 // not run into issues during insertion point restore in 4157 // getOrCreateVectorValue calls below. 4158 Builder.SetInsertPoint(NewPhi); 4159 4160 // The predecessor order is preserved and we can rely on mapping between 4161 // scalar and vector block predecessors. 4162 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4163 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4164 4165 // When looking up the new scalar/vector values to fix up, use incoming 4166 // values from original phi. 4167 Value *ScIncV = 4168 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4169 4170 // Scalar incoming value may need a broadcast 4171 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4172 NewPhi->addIncoming(NewIncV, NewPredBB); 4173 } 4174 } 4175 } 4176 4177 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, 4178 unsigned UF, unsigned VF, 4179 bool IsPtrLoopInvariant, 4180 SmallBitVector &IsIndexLoopInvariant, 4181 VPTransformState &State) { 4182 // Construct a vector GEP by widening the operands of the scalar GEP as 4183 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4184 // results in a vector of pointers when at least one operand of the GEP 4185 // is vector-typed. Thus, to keep the representation compact, we only use 4186 // vector-typed operands for loop-varying values. 
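  //
  // For instance (an illustrative sketch, assuming VF = 4, a loop-invariant
  // base pointer %a, and a loop-varying index): the scalar
  //   %gep = getelementptr inbounds i32, i32* %a, i64 %idx
  // is widened into a GEP with a scalar base and a vector index,
  //   %vecgep = getelementptr inbounds i32, i32* %a, <4 x i64> %vec.idx
  // which produces a <4 x i32*> vector of pointers.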
4187 4188 if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4189 // If we are vectorizing, but the GEP has only loop-invariant operands, 4190 // the GEP we build (by only using vector-typed operands for 4191 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4192 // produce a vector of pointers, we need to either arbitrarily pick an 4193 // operand to broadcast, or broadcast a clone of the original GEP. 4194 // Here, we broadcast a clone of the original. 4195 // 4196 // TODO: If at some point we decide to scalarize instructions having 4197 // loop-invariant operands, this special case will no longer be 4198 // required. We would add the scalarization decision to 4199 // collectLoopScalars() and teach getVectorValue() to broadcast 4200 // the lane-zero scalar value. 4201 auto *Clone = Builder.Insert(GEP->clone()); 4202 for (unsigned Part = 0; Part < UF; ++Part) { 4203 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4204 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4205 addMetadata(EntryPart, GEP); 4206 } 4207 } else { 4208 // If the GEP has at least one loop-varying operand, we are sure to 4209 // produce a vector of pointers. But if we are only unrolling, we want 4210 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4211 // produce with the code below will be scalar (if VF == 1) or vector 4212 // (otherwise). Note that for the unroll-only case, we still maintain 4213 // values in the vector mapping with initVector, as we do for other 4214 // instructions. 4215 for (unsigned Part = 0; Part < UF; ++Part) { 4216 // The pointer operand of the new GEP. If it's loop-invariant, we 4217 // won't broadcast it. 4218 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) 4219 : State.get(Operands.getOperand(0), Part); 4220 4221 // Collect all the indices for the new GEP. If any index is 4222 // loop-invariant, we won't broadcast it. 4223 SmallVector<Value *, 4> Indices; 4224 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4225 VPValue *Operand = Operands.getOperand(I); 4226 if (IsIndexLoopInvariant[I - 1]) 4227 Indices.push_back(State.get(Operand, {0, 0})); 4228 else 4229 Indices.push_back(State.get(Operand, Part)); 4230 } 4231 4232 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4233 // but it should be a vector, otherwise. 4234 auto *NewGEP = 4235 GEP->isInBounds() 4236 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4237 Indices) 4238 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4239 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4240 "NewGEP is not a pointer vector"); 4241 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4242 addMetadata(NewGEP, GEP); 4243 } 4244 } 4245 } 4246 4247 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4248 unsigned VF) { 4249 PHINode *P = cast<PHINode>(PN); 4250 if (EnableVPlanNativePath) { 4251 // Currently we enter here in the VPlan-native path for non-induction 4252 // PHIs where all control flow is uniform. We simply widen these PHIs. 4253 // Create a vector phi with no operands - the vector phi operands will be 4254 // set at the end of vector code generation. 4255 Type *VecTy = 4256 (VF == 1) ? 
PN->getType() : FixedVectorType::get(PN->getType(), VF); 4257 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4258 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4259 OrigPHIsToFix.push_back(P); 4260 4261 return; 4262 } 4263 4264 assert(PN->getParent() == OrigLoop->getHeader() && 4265 "Non-header phis should have been handled elsewhere"); 4266 4267 // In order to support recurrences we need to be able to vectorize Phi nodes. 4268 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4269 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4270 // this value when we vectorize all of the instructions that use the PHI. 4271 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4272 for (unsigned Part = 0; Part < UF; ++Part) { 4273 // This is phase one of vectorizing PHIs. 4274 bool ScalarPHI = (VF == 1) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4275 Type *VecTy = 4276 ScalarPHI ? PN->getType() : FixedVectorType::get(PN->getType(), VF); 4277 Value *EntryPart = PHINode::Create( 4278 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4279 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4280 } 4281 return; 4282 } 4283 4284 setDebugLocFromInst(Builder, P); 4285 4286 // This PHINode must be an induction variable. 4287 // Make sure that we know about it. 4288 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4289 4290 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4291 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4292 4293 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4294 // which can be found from the original scalar operations. 4295 switch (II.getKind()) { 4296 case InductionDescriptor::IK_NoInduction: 4297 llvm_unreachable("Unknown induction"); 4298 case InductionDescriptor::IK_IntInduction: 4299 case InductionDescriptor::IK_FpInduction: 4300 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4301 case InductionDescriptor::IK_PtrInduction: { 4302 // Handle the pointer induction variable case. 4303 assert(P->getType()->isPointerTy() && "Unexpected type."); 4304 4305 if (Cost->isScalarAfterVectorization(P, VF)) { 4306 // This is the normalized GEP that starts counting at zero. 4307 Value *PtrInd = 4308 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4309 // Determine the number of scalars we need to generate for each unroll 4310 // iteration. If the instruction is uniform, we only need to generate the 4311 // first lane. Otherwise, we generate all VF values. 4312 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 
1 : VF; 4313 for (unsigned Part = 0; Part < UF; ++Part) { 4314 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4315 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); 4316 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4317 Value *SclrGep = 4318 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4319 SclrGep->setName("next.gep"); 4320 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4321 } 4322 } 4323 return; 4324 } 4325 assert(isa<SCEVConstant>(II.getStep()) && 4326 "Induction step not a SCEV constant!"); 4327 Type *PhiType = II.getStep()->getType(); 4328 4329 // Build a pointer phi 4330 Value *ScalarStartValue = II.getStartValue(); 4331 Type *ScStValueType = ScalarStartValue->getType(); 4332 PHINode *NewPointerPhi = 4333 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4334 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4335 4336 // A pointer induction, performed by using a gep 4337 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4338 Instruction *InductionLoc = LoopLatch->getTerminator(); 4339 const SCEV *ScalarStep = II.getStep(); 4340 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4341 Value *ScalarStepValue = 4342 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4343 Value *InductionGEP = GetElementPtrInst::Create( 4344 ScStValueType->getPointerElementType(), NewPointerPhi, 4345 Builder.CreateMul(ScalarStepValue, ConstantInt::get(PhiType, VF * UF)), 4346 "ptr.ind", InductionLoc); 4347 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4348 4349 // Create UF many actual address geps that use the pointer 4350 // phi as base and a vectorized version of the step value 4351 // (<step*0, ..., step*N>) as offset. 4352 for (unsigned Part = 0; Part < UF; ++Part) { 4353 SmallVector<Constant *, 8> Indices; 4354 // Create a vector of consecutive numbers from zero to VF. 4355 for (unsigned i = 0; i < VF; ++i) 4356 Indices.push_back(ConstantInt::get(PhiType, i + Part * VF)); 4357 Constant *StartOffset = ConstantVector::get(Indices); 4358 4359 Value *GEP = Builder.CreateGEP( 4360 ScStValueType->getPointerElementType(), NewPointerPhi, 4361 Builder.CreateMul(StartOffset, 4362 Builder.CreateVectorSplat(VF, ScalarStepValue), 4363 "vector.gep")); 4364 VectorLoopValueMap.setVectorValue(P, Part, GEP); 4365 } 4366 } 4367 } 4368 } 4369 4370 /// A helper function for checking whether an integer division-related 4371 /// instruction may divide by zero (in which case it must be predicated if 4372 /// executed conditionally in the scalar code). 4373 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4374 /// Non-zero divisors that are non compile-time constants will not be 4375 /// converted into multiplication, so we will still end up scalarizing 4376 /// the division, but can do so w/o predication. 
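/// For example (illustrative only): 'x / 7' has a known non-zero constant
/// divisor and never needs such predication, whereas 'x / y' with a
/// loop-varying divisor, or a literal zero divisor, does.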
4377 static bool mayDivideByZero(Instruction &I) { 4378 assert((I.getOpcode() == Instruction::UDiv || 4379 I.getOpcode() == Instruction::SDiv || 4380 I.getOpcode() == Instruction::URem || 4381 I.getOpcode() == Instruction::SRem) && 4382 "Unexpected instruction"); 4383 Value *Divisor = I.getOperand(1); 4384 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4385 return !CInt || CInt->isZero(); 4386 } 4387 4388 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, 4389 VPTransformState &State) { 4390 switch (I.getOpcode()) { 4391 case Instruction::Call: 4392 case Instruction::Br: 4393 case Instruction::PHI: 4394 case Instruction::GetElementPtr: 4395 case Instruction::Select: 4396 llvm_unreachable("This instruction is handled by a different recipe."); 4397 case Instruction::UDiv: 4398 case Instruction::SDiv: 4399 case Instruction::SRem: 4400 case Instruction::URem: 4401 case Instruction::Add: 4402 case Instruction::FAdd: 4403 case Instruction::Sub: 4404 case Instruction::FSub: 4405 case Instruction::FNeg: 4406 case Instruction::Mul: 4407 case Instruction::FMul: 4408 case Instruction::FDiv: 4409 case Instruction::FRem: 4410 case Instruction::Shl: 4411 case Instruction::LShr: 4412 case Instruction::AShr: 4413 case Instruction::And: 4414 case Instruction::Or: 4415 case Instruction::Xor: { 4416 // Just widen unops and binops. 4417 setDebugLocFromInst(Builder, &I); 4418 4419 for (unsigned Part = 0; Part < UF; ++Part) { 4420 SmallVector<Value *, 2> Ops; 4421 for (VPValue *VPOp : User.operands()) 4422 Ops.push_back(State.get(VPOp, Part)); 4423 4424 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4425 4426 if (auto *VecOp = dyn_cast<Instruction>(V)) 4427 VecOp->copyIRFlags(&I); 4428 4429 // Use this vector value for all users of the original instruction. 4430 VectorLoopValueMap.setVectorValue(&I, Part, V); 4431 addMetadata(V, &I); 4432 } 4433 4434 break; 4435 } 4436 case Instruction::ICmp: 4437 case Instruction::FCmp: { 4438 // Widen compares. Generate vector compares. 4439 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4440 auto *Cmp = cast<CmpInst>(&I); 4441 setDebugLocFromInst(Builder, Cmp); 4442 for (unsigned Part = 0; Part < UF; ++Part) { 4443 Value *A = State.get(User.getOperand(0), Part); 4444 Value *B = State.get(User.getOperand(1), Part); 4445 Value *C = nullptr; 4446 if (FCmp) { 4447 // Propagate fast math flags. 4448 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4449 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4450 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4451 } else { 4452 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4453 } 4454 VectorLoopValueMap.setVectorValue(&I, Part, C); 4455 addMetadata(C, &I); 4456 } 4457 4458 break; 4459 } 4460 4461 case Instruction::ZExt: 4462 case Instruction::SExt: 4463 case Instruction::FPToUI: 4464 case Instruction::FPToSI: 4465 case Instruction::FPExt: 4466 case Instruction::PtrToInt: 4467 case Instruction::IntToPtr: 4468 case Instruction::SIToFP: 4469 case Instruction::UIToFP: 4470 case Instruction::Trunc: 4471 case Instruction::FPTrunc: 4472 case Instruction::BitCast: { 4473 auto *CI = cast<CastInst>(&I); 4474 setDebugLocFromInst(Builder, CI); 4475 4476 /// Vectorize casts. 4477 Type *DestTy = 4478 (VF == 1) ? 
CI->getType() : FixedVectorType::get(CI->getType(), VF); 4479 4480 for (unsigned Part = 0; Part < UF; ++Part) { 4481 Value *A = State.get(User.getOperand(0), Part); 4482 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4483 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4484 addMetadata(Cast, &I); 4485 } 4486 break; 4487 } 4488 default: 4489 // This instruction is not vectorized by simple widening. 4490 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4491 llvm_unreachable("Unhandled instruction!"); 4492 } // end of switch. 4493 } 4494 4495 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, 4496 VPTransformState &State) { 4497 assert(!isa<DbgInfoIntrinsic>(I) && 4498 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4499 setDebugLocFromInst(Builder, &I); 4500 4501 Module *M = I.getParent()->getParent()->getParent(); 4502 auto *CI = cast<CallInst>(&I); 4503 4504 SmallVector<Type *, 4> Tys; 4505 for (Value *ArgOperand : CI->arg_operands()) 4506 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4507 4508 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4509 4510 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4511 // version of the instruction. 4512 // Is it beneficial to perform intrinsic call compared to lib call? 4513 bool NeedToScalarize = false; 4514 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4515 bool UseVectorIntrinsic = 4516 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4517 assert((UseVectorIntrinsic || !NeedToScalarize) && 4518 "Instruction should be scalarized elsewhere."); 4519 4520 for (unsigned Part = 0; Part < UF; ++Part) { 4521 SmallVector<Value *, 4> Args; 4522 for (auto &I : enumerate(ArgOperands.operands())) { 4523 // Some intrinsics have a scalar argument - don't replace it with a 4524 // vector. 4525 Value *Arg; 4526 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4527 Arg = State.get(I.value(), Part); 4528 else 4529 Arg = State.get(I.value(), {0, 0}); 4530 Args.push_back(Arg); 4531 } 4532 4533 Function *VectorF; 4534 if (UseVectorIntrinsic) { 4535 // Use vector version of the intrinsic. 4536 Type *TysForDecl[] = {CI->getType()}; 4537 if (VF > 1) 4538 TysForDecl[0] = 4539 FixedVectorType::get(CI->getType()->getScalarType(), VF); 4540 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4541 assert(VectorF && "Can't retrieve vector intrinsic."); 4542 } else { 4543 // Use vector version of the function call. 4544 const VFShape Shape = 4545 VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/); 4546 #ifndef NDEBUG 4547 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4548 "Can't create vector function."); 4549 #endif 4550 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4551 } 4552 SmallVector<OperandBundleDef, 1> OpBundles; 4553 CI->getOperandBundlesAsDefs(OpBundles); 4554 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4555 4556 if (isa<FPMathOperator>(V)) 4557 V->copyFastMathFlags(CI); 4558 4559 VectorLoopValueMap.setVectorValue(&I, Part, V); 4560 addMetadata(V, &I); 4561 } 4562 } 4563 4564 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, 4565 VPUser &Operands, 4566 bool InvariantCond, 4567 VPTransformState &State) { 4568 setDebugLocFromInst(Builder, &I); 4569 4570 // The condition can be loop invariant but still defined inside the 4571 // loop. This means that we can't just use the original 'cond' value. 
4572 // We have to take the 'vectorized' value and pick the first lane. 4573 // Instcombine will make this a no-op. 4574 auto *InvarCond = 4575 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4576 4577 for (unsigned Part = 0; Part < UF; ++Part) { 4578 Value *Cond = 4579 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4580 Value *Op0 = State.get(Operands.getOperand(1), Part); 4581 Value *Op1 = State.get(Operands.getOperand(2), Part); 4582 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4583 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4584 addMetadata(Sel, &I); 4585 } 4586 } 4587 4588 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4589 // We should not collect Scalars more than once per VF. Right now, this 4590 // function is called from collectUniformsAndScalars(), which already does 4591 // this check. Collecting Scalars for VF=1 does not make any sense. 4592 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && 4593 "This function should not be visited twice for the same VF"); 4594 4595 SmallSetVector<Instruction *, 8> Worklist; 4596 4597 // These sets are used to seed the analysis with pointers used by memory 4598 // accesses that will remain scalar. 4599 SmallSetVector<Instruction *, 8> ScalarPtrs; 4600 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4601 auto *Latch = TheLoop->getLoopLatch(); 4602 4603 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4604 // The pointer operands of loads and stores will be scalar as long as the 4605 // memory access is not a gather or scatter operation. The value operand of a 4606 // store will remain scalar if the store is scalarized. 4607 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4608 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4609 assert(WideningDecision != CM_Unknown && 4610 "Widening decision should be ready at this moment"); 4611 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4612 if (Ptr == Store->getValueOperand()) 4613 return WideningDecision == CM_Scalarize; 4614 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4615 "Ptr is neither a value or pointer operand"); 4616 return WideningDecision != CM_GatherScatter; 4617 }; 4618 4619 // A helper that returns true if the given value is a bitcast or 4620 // getelementptr instruction contained in the loop. 4621 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4622 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4623 isa<GetElementPtrInst>(V)) && 4624 !TheLoop->isLoopInvariant(V); 4625 }; 4626 4627 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 4628 if (!isa<PHINode>(Ptr) || 4629 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 4630 return false; 4631 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 4632 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 4633 return false; 4634 return isScalarUse(MemAccess, Ptr); 4635 }; 4636 4637 // A helper that evaluates a memory access's use of a pointer. If the 4638 // pointer is actually the pointer induction of a loop, it is being 4639 // inserted into Worklist. If the use will be a scalar use, and the 4640 // pointer is only used by memory accesses, we place the pointer in 4641 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 
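  //
  // For instance (an assumed illustrative case): a phi-based pointer
  // induction that is only dereferenced by consecutive loads/stores is
  // inserted into Worklist together with its update instruction, while a
  // loop-varying getelementptr feeding a widened (non-gather/scatter) load
  // whose only users are memory accesses ends up in ScalarPtrs.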
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    if (isScalarPtrInduction(MemAccess, Ptr)) {
      Worklist.insert(cast<Instruction>(Ptr));
      Instruction *Update = cast<Instruction>(
          cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
      Worklist.insert(Update);
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
                        << "\n");
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
                        << "\n");
      return;
    }
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // Insert the forced scalars.
  // FIXME: Currently widenPHIInstruction() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second)
      Worklist.insert(I);

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
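  //
  // E.g. (illustrative): if a getelementptr already in Worklist takes its
  // pointer operand from a loop-varying bitcast, and every user of that
  // bitcast is either outside the loop, already scalar, or a memory access
  // using it as a scalar pointer, the bitcast itself is added as scalar too.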
4717 unsigned Idx = 0; 4718 while (Idx != Worklist.size()) { 4719 Instruction *Dst = Worklist[Idx++]; 4720 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4721 continue; 4722 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4723 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4724 auto *J = cast<Instruction>(U); 4725 return !TheLoop->contains(J) || Worklist.count(J) || 4726 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4727 isScalarUse(J, Src)); 4728 })) { 4729 Worklist.insert(Src); 4730 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4731 } 4732 } 4733 4734 // An induction variable will remain scalar if all users of the induction 4735 // variable and induction variable update remain scalar. 4736 for (auto &Induction : Legal->getInductionVars()) { 4737 auto *Ind = Induction.first; 4738 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4739 4740 // If tail-folding is applied, the primary induction variable will be used 4741 // to feed a vector compare. 4742 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4743 continue; 4744 4745 // Determine if all users of the induction variable are scalar after 4746 // vectorization. 4747 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4748 auto *I = cast<Instruction>(U); 4749 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4750 }); 4751 if (!ScalarInd) 4752 continue; 4753 4754 // Determine if all users of the induction variable update instruction are 4755 // scalar after vectorization. 4756 auto ScalarIndUpdate = 4757 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4758 auto *I = cast<Instruction>(U); 4759 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4760 }); 4761 if (!ScalarIndUpdate) 4762 continue; 4763 4764 // The induction variable and its update instruction will remain scalar. 4765 Worklist.insert(Ind); 4766 Worklist.insert(IndUpdate); 4767 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4768 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4769 << "\n"); 4770 } 4771 4772 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4773 } 4774 4775 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { 4776 if (!blockNeedsPredication(I->getParent())) 4777 return false; 4778 switch(I->getOpcode()) { 4779 default: 4780 break; 4781 case Instruction::Load: 4782 case Instruction::Store: { 4783 if (!Legal->isMaskRequired(I)) 4784 return false; 4785 auto *Ptr = getLoadStorePointerOperand(I); 4786 auto *Ty = getMemInstValueType(I); 4787 // We have already decided how to vectorize this instruction, get that 4788 // result. 4789 if (VF > 1) { 4790 InstWidening WideningDecision = getWideningDecision(I, VF); 4791 assert(WideningDecision != CM_Unknown && 4792 "Widening decision should be ready at this moment"); 4793 return WideningDecision == CM_Scalarize; 4794 } 4795 const Align Alignment = getLoadStoreAlignment(I); 4796 return isa<LoadInst>(I) ? 
        !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
          isLegalMaskedGather(Ty, Alignment))
      : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
          isLegalMaskedScatter(Ty, Alignment));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    return mayDivideByZero(*I);
  }
  return false;
}

bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
                                                               unsigned VF) {
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
  assert(getWideningDecision(I, VF) == CM_Unknown &&
         "Decision should not be set yet.");
  auto *Group = getInterleavedAccessGroup(I);
  assert(Group && "Must have a group.");

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = getMemInstValueType(I);
  if (hasIrregularType(ScalarTy, DL, VF))
    return false;

  // Check if masking is required.
  // A Group may need masking for one of two reasons: it resides in a block that
  // needs predication, or it was decided to use masking to deal with gaps.
  bool PredicatedAccessRequiresMasking =
      Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
  bool AccessWithGapsRequiresMasking =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
  assert(useMaskedInterleavedAccesses(TTI) &&
         "Masked interleave-groups for predicated accesses are not enabled.");

  auto *Ty = getMemInstValueType(I);
  const Align Alignment = getLoadStoreAlignment(I);
  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
                          : TTI.isLegalMaskedStore(Ty, Alignment);
}

bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
                                                               unsigned VF) {
  // Get and ensure we have a valid memory instruction.
  LoadInst *LI = dyn_cast<LoadInst>(I);
  StoreInst *SI = dyn_cast<StoreInst>(I);
  assert((LI || SI) && "Invalid memory instruction");

  auto *Ptr = getLoadStorePointerOperand(I);

  // In order to be widened, the pointer must, first of all, be consecutive.
  if (!Legal->isConsecutivePtr(Ptr))
    return false;

  // If the instruction is a store located in a predicated block, it will be
  // scalarized.
  if (isScalarWithPredication(I))
    return false;

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  if (hasIrregularType(ScalarTy, DL, VF))
    return false;

  return true;
}

void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.
4880 4881 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && 4882 "This function should not be visited twice for the same VF"); 4883 4884 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4885 // not analyze again. Uniforms.count(VF) will return 1. 4886 Uniforms[VF].clear(); 4887 4888 // We now know that the loop is vectorizable! 4889 // Collect instructions inside the loop that will remain uniform after 4890 // vectorization. 4891 4892 // Global values, params and instructions outside of current loop are out of 4893 // scope. 4894 auto isOutOfScope = [&](Value *V) -> bool { 4895 Instruction *I = dyn_cast<Instruction>(V); 4896 return (!I || !TheLoop->contains(I)); 4897 }; 4898 4899 SetVector<Instruction *> Worklist; 4900 BasicBlock *Latch = TheLoop->getLoopLatch(); 4901 4902 // Instructions that are scalar with predication must not be considered 4903 // uniform after vectorization, because that would create an erroneous 4904 // replicating region where only a single instance out of VF should be formed. 4905 // TODO: optimize such seldom cases if found important, see PR40816. 4906 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4907 if (isScalarWithPredication(I, VF)) { 4908 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4909 << *I << "\n"); 4910 return; 4911 } 4912 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4913 Worklist.insert(I); 4914 }; 4915 4916 // Start with the conditional branch. If the branch condition is an 4917 // instruction contained in the loop that is only used by the branch, it is 4918 // uniform. 4919 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4920 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4921 addToWorklistIfAllowed(Cmp); 4922 4923 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 4924 // are pointers that are treated like consecutive pointers during 4925 // vectorization. The pointer operands of interleaved accesses are an 4926 // example. 4927 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 4928 4929 // Holds pointer operands of instructions that are possibly non-uniform. 4930 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 4931 4932 auto isUniformDecision = [&](Instruction *I, unsigned VF) { 4933 InstWidening WideningDecision = getWideningDecision(I, VF); 4934 assert(WideningDecision != CM_Unknown && 4935 "Widening decision should be ready at this moment"); 4936 4937 return (WideningDecision == CM_Widen || 4938 WideningDecision == CM_Widen_Reverse || 4939 WideningDecision == CM_Interleave); 4940 }; 4941 // Iterate over the instructions in the loop, and collect all 4942 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 4943 // that a consecutive-like pointer operand will be scalarized, we collect it 4944 // in PossibleNonUniformPtrs instead. We use two sets here because a single 4945 // getelementptr instruction can be used by both vectorized and scalarized 4946 // memory instructions. For example, if a loop loads and stores from the same 4947 // location, but the store is conditional, the store will be scalarized, and 4948 // the getelementptr won't remain uniform. 4949 for (auto *BB : TheLoop->blocks()) 4950 for (auto &I : *BB) { 4951 // If there's no pointer operand, there's nothing to do. 
4952 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 4953 if (!Ptr) 4954 continue; 4955 4956 // True if all users of Ptr are memory accesses that have Ptr as their 4957 // pointer operand. 4958 auto UsersAreMemAccesses = 4959 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 4960 return getLoadStorePointerOperand(U) == Ptr; 4961 }); 4962 4963 // Ensure the memory instruction will not be scalarized or used by 4964 // gather/scatter, making its pointer operand non-uniform. If the pointer 4965 // operand is used by any instruction other than a memory access, we 4966 // conservatively assume the pointer operand may be non-uniform. 4967 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 4968 PossibleNonUniformPtrs.insert(Ptr); 4969 4970 // If the memory instruction will be vectorized and its pointer operand 4971 // is consecutive-like, or interleaving - the pointer operand should 4972 // remain uniform. 4973 else 4974 ConsecutiveLikePtrs.insert(Ptr); 4975 } 4976 4977 // Add to the Worklist all consecutive and consecutive-like pointers that 4978 // aren't also identified as possibly non-uniform. 4979 for (auto *V : ConsecutiveLikePtrs) 4980 if (!PossibleNonUniformPtrs.count(V)) 4981 addToWorklistIfAllowed(V); 4982 4983 // Expand Worklist in topological order: whenever a new instruction 4984 // is added , its users should be already inside Worklist. It ensures 4985 // a uniform instruction will only be used by uniform instructions. 4986 unsigned idx = 0; 4987 while (idx != Worklist.size()) { 4988 Instruction *I = Worklist[idx++]; 4989 4990 for (auto OV : I->operand_values()) { 4991 // isOutOfScope operands cannot be uniform instructions. 4992 if (isOutOfScope(OV)) 4993 continue; 4994 // First order recurrence Phi's should typically be considered 4995 // non-uniform. 4996 auto *OP = dyn_cast<PHINode>(OV); 4997 if (OP && Legal->isFirstOrderRecurrence(OP)) 4998 continue; 4999 // If all the users of the operand are uniform, then add the 5000 // operand into the uniform worklist. 5001 auto *OI = cast<Instruction>(OV); 5002 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5003 auto *J = cast<Instruction>(U); 5004 return Worklist.count(J) || 5005 (OI == getLoadStorePointerOperand(J) && 5006 isUniformDecision(J, VF)); 5007 })) 5008 addToWorklistIfAllowed(OI); 5009 } 5010 } 5011 5012 // Returns true if Ptr is the pointer operand of a memory access instruction 5013 // I, and I is known to not require scalarization. 5014 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5015 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5016 }; 5017 5018 // For an instruction to be added into Worklist above, all its users inside 5019 // the loop should also be in Worklist. However, this condition cannot be 5020 // true for phi nodes that form a cyclic dependence. We must process phi 5021 // nodes separately. An induction variable will remain uniform if all users 5022 // of the induction variable and induction variable update remain uniform. 5023 // The code below handles both pointer and non-pointer induction variables. 5024 for (auto &Induction : Legal->getInductionVars()) { 5025 auto *Ind = Induction.first; 5026 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5027 5028 // Determine if all users of the induction variable are uniform after 5029 // vectorization. 
    auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             isVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    auto UniformIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 isVectorizedMemAccessUse(I, IndUpdate);
        });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    addToWorklistIfAllowed(Ind);
    addToWorklistIfAllowed(IndUpdate);
  }

  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}

bool LoopVectorizationCostModel::runtimeChecksRequired() {
  LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");

  if (Legal->getRuntimePointerChecking()->Need) {
    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
        "runtime pointer checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  if (!PSE.getUnionPredicate().getPredicates().empty()) {
    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
        "runtime SCEV checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  // FIXME: Avoid specializing for stride==1 instead of bailing out.
  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
    reportVectorizationFailure("Runtime stride check for small trip count",
        "runtime stride == 1 checks needed. Enable vectorization of "
        "this loop without such check by compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  return false;
}

Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
                                                            unsigned UserIC) {
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since it's still likely to be
    // dynamically uniform if the target can skip.
    reportVectorizationFailure(
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return None;
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return None;
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return UserVF ? UserVF : computeFeasibleMaxVF(TC);
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return None;
    break;
  }

  // Now try the tail folding

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
  assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
  unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
  if (TC > 0 && TC % MaxVFtimesIC == 0) {
    // Accept MaxVF if we do not have a tail.
    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
    return MaxVF;
  }

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxVF;
  }

  if (TC == 0) {
    reportVectorizationFailure(
        "Unable to calculate the loop count due to complex control flow",
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
    return None;
  }

  reportVectorizationFailure(
      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time. "
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
  return None;
}

unsigned
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
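  //
  // Worked example (illustrative numbers only): if LAA found a minimum
  // dependence distance of 32 bytes on i32 accesses, at most 8 lanes are
  // safe, i.e. MaxSafeRegisterWidth = 8 * 4 * 8 = 256 bits, and a 512-bit
  // WidestRegister would be clamped down to 256 bits below.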
  unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();

  WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
  unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << WidestRegister << " bits.\n");

  assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
                                 " into one vector!");
  if (MaxVectorSize == 0) {
    LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
    MaxVectorSize = 1;
    return MaxVectorSize;
  } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
             isPowerOf2_32(ConstTripCount)) {
    // We need to clamp the VF to be the ConstTripCount. There is no point in
    // choosing a higher viable VF as done in the loop below.
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
                      << ConstTripCount << "\n");
    MaxVectorSize = ConstTripCount;
    return MaxVectorSize;
  }

  unsigned MaxVF = MaxVectorSize;
  if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorSize).
    SmallVector<unsigned, 8> VFs;
    unsigned NewMaxVectorSize = WidestRegister / SmallestType;
    for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
      VFs.push_back(VS);

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
    for (int i = RUs.size() - 1; i >= 0; --i) {
      bool Selected = true;
      for (auto& pair : RUs[i].MaxLocalUsers) {
        unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
        if (pair.second > TargetNumRegisters)
          Selected = false;
      }
      if (Selected) {
        MaxVF = VFs[i];
        break;
      }
    }
    if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }
  }
  return MaxVF;
}

VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
  float Cost = expectedCost(1).first;
  const float ScalarCost = Cost;
  unsigned Width = 1;
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization && MaxVF > 1) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    Cost = std::numeric_limits<float>::max();
  }

  for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
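    //
    // Worked example (illustrative numbers only): with a scalar iteration
    // cost of 8 and an expected cost of 20 for VF = 4, the per-lane cost is
    // 20 / 4 = 5 < 8, so VF = 4 would be preferred over the scalar loop
    // unless a wider VF turns out to be cheaper still.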
5281 VectorizationCostTy C = expectedCost(i); 5282 float VectorCost = C.first / (float)i; 5283 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5284 << " costs: " << (int)VectorCost << ".\n"); 5285 if (!C.second && !ForceVectorization) { 5286 LLVM_DEBUG( 5287 dbgs() << "LV: Not considering vector loop of width " << i 5288 << " because it will not generate any vector instructions.\n"); 5289 continue; 5290 } 5291 if (VectorCost < Cost) { 5292 Cost = VectorCost; 5293 Width = i; 5294 } 5295 } 5296 5297 if (!EnableCondStoresVectorization && NumPredStores) { 5298 reportVectorizationFailure("There are conditional stores.", 5299 "store that is conditionally executed prevents vectorization", 5300 "ConditionalStore", ORE, TheLoop); 5301 Width = 1; 5302 Cost = ScalarCost; 5303 } 5304 5305 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5306 << "LV: Vectorization seems to be not beneficial, " 5307 << "but was forced by a user.\n"); 5308 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5309 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; 5310 return Factor; 5311 } 5312 5313 std::pair<unsigned, unsigned> 5314 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5315 unsigned MinWidth = -1U; 5316 unsigned MaxWidth = 8; 5317 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5318 5319 // For each block. 5320 for (BasicBlock *BB : TheLoop->blocks()) { 5321 // For each instruction in the loop. 5322 for (Instruction &I : BB->instructionsWithoutDebug()) { 5323 Type *T = I.getType(); 5324 5325 // Skip ignored values. 5326 if (ValuesToIgnore.count(&I)) 5327 continue; 5328 5329 // Only examine Loads, Stores and PHINodes. 5330 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5331 continue; 5332 5333 // Examine PHI nodes that are reduction variables. Update the type to 5334 // account for the recurrence type. 5335 if (auto *PN = dyn_cast<PHINode>(&I)) { 5336 if (!Legal->isReductionVariable(PN)) 5337 continue; 5338 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5339 T = RdxDesc.getRecurrenceType(); 5340 } 5341 5342 // Examine the stored values. 5343 if (auto *ST = dyn_cast<StoreInst>(&I)) 5344 T = ST->getValueOperand()->getType(); 5345 5346 // Ignore loaded pointer types and stored pointer types that are not 5347 // vectorizable. 5348 // 5349 // FIXME: The check here attempts to predict whether a load or store will 5350 // be vectorized. We only know this for certain after a VF has 5351 // been selected. Here, we assume that if an access can be 5352 // vectorized, it will be. We should also look at extending this 5353 // optimization to non-pointer types. 5354 // 5355 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5356 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5357 continue; 5358 5359 MinWidth = std::min(MinWidth, 5360 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5361 MaxWidth = std::max(MaxWidth, 5362 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5363 } 5364 } 5365 5366 return {MinWidth, MaxWidth}; 5367 } 5368 5369 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, 5370 unsigned LoopCost) { 5371 // -- The interleave heuristics -- 5372 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5373 // There are many micro-architectural considerations that we can't predict 5374 // at this level. 
For example, frontend pressure (on decode or fetch) due to 5375 // code size, or the number and capabilities of the execution ports. 5376 // 5377 // We use the following heuristics to select the interleave count: 5378 // 1. If the code has reductions, then we interleave to break the cross 5379 // iteration dependency. 5380 // 2. If the loop is really small, then we interleave to reduce the loop 5381 // overhead. 5382 // 3. We don't interleave if we think that we will spill registers to memory 5383 // due to the increased register pressure. 5384 5385 if (!isScalarEpilogueAllowed()) 5386 return 1; 5387 5388 // We used the distance for the interleave count. 5389 if (Legal->getMaxSafeDepDistBytes() != -1U) 5390 return 1; 5391 5392 // Do not interleave loops with a relatively small known or estimated trip 5393 // count. 5394 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5395 if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) 5396 return 1; 5397 5398 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5399 // We divide by these constants so assume that we have at least one 5400 // instruction that uses at least one register. 5401 for (auto& pair : R.MaxLocalUsers) { 5402 pair.second = std::max(pair.second, 1U); 5403 } 5404 5405 // We calculate the interleave count using the following formula. 5406 // Subtract the number of loop invariants from the number of available 5407 // registers. These registers are used by all of the interleaved instances. 5408 // Next, divide the remaining registers by the number of registers that is 5409 // required by the loop, in order to estimate how many parallel instances 5410 // fit without causing spills. All of this is rounded down if necessary to be 5411 // a power of two. We want power of two interleave count to simplify any 5412 // addressing operations or alignment considerations. 5413 // We also want power of two interleave counts to ensure that the induction 5414 // variable of the vector loop wraps to zero, when tail is folded by masking; 5415 // this currently happens when OptForSize, in which case IC is set to 1 above. 5416 unsigned IC = UINT_MAX; 5417 5418 for (auto& pair : R.MaxLocalUsers) { 5419 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5420 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5421 << " registers of " 5422 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5423 if (VF == 1) { 5424 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5425 TargetNumRegisters = ForceTargetNumScalarRegs; 5426 } else { 5427 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5428 TargetNumRegisters = ForceTargetNumVectorRegs; 5429 } 5430 unsigned MaxLocalUsers = pair.second; 5431 unsigned LoopInvariantRegs = 0; 5432 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5433 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5434 5435 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5436 // Don't count the induction variable as interleaved. 5437 if (EnableIndVarRegisterHeur) { 5438 TmpIC = 5439 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5440 std::max(1U, (MaxLocalUsers - 1))); 5441 } 5442 5443 IC = std::min(IC, TmpIC); 5444 } 5445 5446 // Clamp the interleave ranges to reasonable counts. 5447 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5448 5449 // Check if the user has overridden the max. 
5450 if (VF == 1) {
5451 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5452 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5453 } else {
5454 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5455 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5456 }
5457 
5458 // If the trip count is a known or estimated compile-time constant, limit the
5459 // interleave count to be less than the trip count divided by VF.
5460 if (BestKnownTC) {
5461 MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5462 }
5463 
5464 // If we did not calculate the cost for VF (because the user selected the VF)
5465 // then we calculate the cost of VF here.
5466 if (LoopCost == 0)
5467 LoopCost = expectedCost(VF).first;
5468 
5469 assert(LoopCost && "Non-zero loop cost expected");
5470 
5471 // Clamp the calculated IC to be between 1 and the max interleave count
5472 // that the target and trip count allow.
5473 if (IC > MaxInterleaveCount)
5474 IC = MaxInterleaveCount;
5475 else if (IC < 1)
5476 IC = 1;
5477 
5478 // Interleave if we vectorized this loop and there is a reduction that could
5479 // benefit from interleaving.
5480 if (VF > 1 && !Legal->getReductionVars().empty()) {
5481 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5482 return IC;
5483 }
5484 
5485 // Note that if we've already vectorized the loop we will have done the
5486 // runtime check and so interleaving won't require further checks.
5487 bool InterleavingRequiresRuntimePointerCheck =
5488 (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5489 
5490 // We want to interleave small loops in order to reduce the loop overhead and
5491 // potentially expose ILP opportunities.
5492 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5493 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5494 // We assume that the cost overhead is 1 and we use the cost model
5495 // to estimate the cost of the loop and interleave until the cost of the
5496 // loop overhead is about 5% of the cost of the loop.
5497 unsigned SmallIC =
5498 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5499 
5500 // Interleave until store/load ports (estimated by max interleave count) are
5501 // saturated.
5502 unsigned NumStores = Legal->getNumStores();
5503 unsigned NumLoads = Legal->getNumLoads();
5504 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5505 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5506 
5507 // If we have a scalar reduction (vector reductions are already dealt with
5508 // by this point), we can increase the critical path length if the loop
5509 // we're interleaving is inside another loop. Limit, by default, to 2, so the
5510 // critical path only gets increased by one reduction operation.
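// Illustrative note (hypothetical numbers): with IC = 8, one store and four
// loads, StoresIC = 8 / 1 = 8 and LoadsIC = 8 / 4 = 2. If the loop also
// contains a scalar reduction and sits inside another loop, the clamp below
// caps SmallIC, StoresIC and LoadsIC at MaxNestedScalarReductionIC (2 by
// default, per the comment above), so the reduction's critical path grows by
// at most one extra reduction operation.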
5511 if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
5512 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5513 SmallIC = std::min(SmallIC, F);
5514 StoresIC = std::min(StoresIC, F);
5515 LoadsIC = std::min(LoadsIC, F);
5516 }
5517 
5518 if (EnableLoadStoreRuntimeInterleave &&
5519 std::max(StoresIC, LoadsIC) > SmallIC) {
5520 LLVM_DEBUG(
5521 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5522 return std::max(StoresIC, LoadsIC);
5523 }
5524 
5525 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5526 return SmallIC;
5527 }
5528 
5529 // Interleave if this is a large loop (small loops are already dealt with by
5530 // this point) that could benefit from interleaving.
5531 bool HasReductions = !Legal->getReductionVars().empty();
5532 if (TTI.enableAggressiveInterleaving(HasReductions)) {
5533 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5534 return IC;
5535 }
5536 
5537 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5538 return 1;
5539 }
5540 
5541 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5542 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5543 // This function calculates the register usage by measuring the highest number
5544 // of values that are alive at a single location. Obviously, this is a very
5545 // rough estimation. We scan the loop in topological order and
5546 // assign a number to each instruction. We use RPO to ensure that defs are
5547 // met before their users. We assume that each instruction that has in-loop
5548 // users starts an interval. We record every time that an in-loop value is
5549 // used, so we have a list of the first and last occurrences of each
5550 // instruction. Next, we transpose this data structure into a multi-map that
5551 // holds the list of intervals that *end* at a specific location. This multi-
5552 // map allows us to perform a linear search. We scan the instructions linearly
5553 // and record each time that a new interval starts, by placing it in a set.
5554 // If we find this value in the multi-map then we remove it from the set.
5555 // The max register usage is the maximum size of the set.
5556 // We also search for instructions that are defined outside the loop, but are
5557 // used inside the loop. We need this number separately from the max-interval
5558 // usage number because when we unroll, loop-invariant values do not take
5559 // more registers.
5560 LoopBlocksDFS DFS(TheLoop);
5561 DFS.perform(LI);
5562 
5563 RegisterUsage RU;
5564 
5565 // Each 'key' in the map opens a new interval. The values
5566 // of the map are the index of the 'last seen' usage of the
5567 // instruction that is the key.
5568 using IntervalMap = DenseMap<Instruction *, unsigned>;
5569 
5570 // Maps an index to the instruction at that position.
5571 SmallVector<Instruction *, 64> IdxToInstr;
5572 // Marks the end of each interval.
5573 IntervalMap EndPoint;
5574 // Saves the set of instructions that are used in the loop.
5575 SmallPtrSet<Instruction *, 8> Ends;
5576 // Saves the list of values that are used in the loop but are
5577 // defined outside the loop, such as arguments and constants.
5578 SmallPtrSet<Value *, 8> LoopInvariants;
5579 
5580 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5581 for (Instruction &I : BB->instructionsWithoutDebug()) {
5582 IdxToInstr.push_back(&I);
5583 
5584 // Save the end location of each USE.
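// Editorial note: EndPoint is overwritten on every use, so once the scan
// finishes it records the last in-loop use of each instruction; an
// instruction's live interval therefore runs from its definition to that
// last use.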
5585 for (Value *U : I.operands()) { 5586 auto *Instr = dyn_cast<Instruction>(U); 5587 5588 // Ignore non-instruction values such as arguments, constants, etc. 5589 if (!Instr) 5590 continue; 5591 5592 // If this instruction is outside the loop then record it and continue. 5593 if (!TheLoop->contains(Instr)) { 5594 LoopInvariants.insert(Instr); 5595 continue; 5596 } 5597 5598 // Overwrite previous end points. 5599 EndPoint[Instr] = IdxToInstr.size(); 5600 Ends.insert(Instr); 5601 } 5602 } 5603 } 5604 5605 // Saves the list of intervals that end with the index in 'key'. 5606 using InstrList = SmallVector<Instruction *, 2>; 5607 DenseMap<unsigned, InstrList> TransposeEnds; 5608 5609 // Transpose the EndPoints to a list of values that end at each index. 5610 for (auto &Interval : EndPoint) 5611 TransposeEnds[Interval.second].push_back(Interval.first); 5612 5613 SmallPtrSet<Instruction *, 8> OpenIntervals; 5614 5615 // Get the size of the widest register. 5616 unsigned MaxSafeDepDist = -1U; 5617 if (Legal->getMaxSafeDepDistBytes() != -1U) 5618 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5619 unsigned WidestRegister = 5620 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5621 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5622 5623 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5624 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5625 5626 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5627 5628 // A lambda that gets the register usage for the given type and VF. 5629 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5630 if (Ty->isTokenTy()) 5631 return 0U; 5632 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5633 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5634 }; 5635 5636 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5637 Instruction *I = IdxToInstr[i]; 5638 5639 // Remove all of the instructions that end at this location. 5640 InstrList &List = TransposeEnds[i]; 5641 for (Instruction *ToRemove : List) 5642 OpenIntervals.erase(ToRemove); 5643 5644 // Ignore instructions that are never used within the loop. 5645 if (!Ends.count(I)) 5646 continue; 5647 5648 // Skip ignored values. 5649 if (ValuesToIgnore.count(I)) 5650 continue; 5651 5652 // For each VF find the maximum usage of registers. 5653 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5654 // Count the number of live intervals. 5655 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5656 5657 if (VFs[j] == 1) { 5658 for (auto Inst : OpenIntervals) { 5659 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5660 if (RegUsage.find(ClassID) == RegUsage.end()) 5661 RegUsage[ClassID] = 1; 5662 else 5663 RegUsage[ClassID] += 1; 5664 } 5665 } else { 5666 collectUniformsAndScalars(VFs[j]); 5667 for (auto Inst : OpenIntervals) { 5668 // Skip ignored values for VF > 1. 
5669 if (VecValuesToIgnore.count(Inst)) 5670 continue; 5671 if (isScalarAfterVectorization(Inst, VFs[j])) { 5672 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5673 if (RegUsage.find(ClassID) == RegUsage.end()) 5674 RegUsage[ClassID] = 1; 5675 else 5676 RegUsage[ClassID] += 1; 5677 } else { 5678 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5679 if (RegUsage.find(ClassID) == RegUsage.end()) 5680 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5681 else 5682 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5683 } 5684 } 5685 } 5686 5687 for (auto& pair : RegUsage) { 5688 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5689 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5690 else 5691 MaxUsages[j][pair.first] = pair.second; 5692 } 5693 } 5694 5695 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5696 << OpenIntervals.size() << '\n'); 5697 5698 // Add the current instruction to the list of open intervals. 5699 OpenIntervals.insert(I); 5700 } 5701 5702 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5703 SmallMapVector<unsigned, unsigned, 4> Invariant; 5704 5705 for (auto Inst : LoopInvariants) { 5706 unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5707 unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); 5708 if (Invariant.find(ClassID) == Invariant.end()) 5709 Invariant[ClassID] = Usage; 5710 else 5711 Invariant[ClassID] += Usage; 5712 } 5713 5714 LLVM_DEBUG({ 5715 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5716 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5717 << " item\n"; 5718 for (const auto &pair : MaxUsages[i]) { 5719 dbgs() << "LV(REG): RegisterClass: " 5720 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5721 << " registers\n"; 5722 } 5723 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5724 << " item\n"; 5725 for (const auto &pair : Invariant) { 5726 dbgs() << "LV(REG): RegisterClass: " 5727 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5728 << " registers\n"; 5729 } 5730 }); 5731 5732 RU.LoopInvariantRegs = Invariant; 5733 RU.MaxLocalUsers = MaxUsages[i]; 5734 RUs[i] = RU; 5735 } 5736 5737 return RUs; 5738 } 5739 5740 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5741 // TODO: Cost model for emulated masked load/store is completely 5742 // broken. This hack guides the cost model to use an artificially 5743 // high enough value to practically disable vectorization with such 5744 // operations, except where previously deployed legality hack allowed 5745 // using very low cost values. This is to avoid regressions coming simply 5746 // from moving "masked load/store" check from legality to cost model. 5747 // Masked Load/Gather emulation was previously never allowed. 5748 // Limited number of Masked Store/Scatter emulation was allowed. 5749 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 5750 return isa<LoadInst>(I) || 5751 (isa<StoreInst>(I) && 5752 NumPredStores > NumberOfStoresToPredicate); 5753 } 5754 5755 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { 5756 // If we aren't vectorizing the loop, or if we've already collected the 5757 // instructions to scalarize, there's nothing to do. Collection may already 5758 // have occurred if we have a user-selected VF and are now computing the 5759 // expected cost for interleaving. 
5760 if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5761 return;
5762 
5763 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5764 // not profitable to scalarize any instructions, the presence of VF in the
5765 // map will indicate that we've analyzed it already.
5766 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5767 
5768 // Find all the instructions that are scalar with predication in the loop and
5769 // determine if it would be better to not if-convert the blocks they are in.
5770 // If so, we also record the instructions to scalarize.
5771 for (BasicBlock *BB : TheLoop->blocks()) {
5772 if (!blockNeedsPredication(BB))
5773 continue;
5774 for (Instruction &I : *BB)
5775 if (isScalarWithPredication(&I)) {
5776 ScalarCostsTy ScalarCosts;
5777 // Do not apply discount logic if hacked cost is needed
5778 // for emulated masked memrefs.
5779 if (!useEmulatedMaskMemRefHack(&I) &&
5780 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5781 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5782 // Remember that BB will remain after vectorization.
5783 PredicatedBBsAfterVectorization.insert(BB);
5784 }
5785 }
5786 }
5787 
5788 int LoopVectorizationCostModel::computePredInstDiscount(
5789 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5790 unsigned VF) {
5791 assert(!isUniformAfterVectorization(PredInst, VF) &&
5792 "Instruction marked uniform-after-vectorization will be predicated");
5793 
5794 // Initialize the discount to zero, meaning that the scalar version and the
5795 // vector version cost the same.
5796 int Discount = 0;
5797 
5798 // Holds instructions to analyze. The instructions we visit are mapped in
5799 // ScalarCosts. Those instructions are the ones that would be scalarized if
5800 // we find that the scalar version costs less.
5801 SmallVector<Instruction *, 8> Worklist;
5802 
5803 // Returns true if the given instruction can be scalarized.
5804 auto canBeScalarized = [&](Instruction *I) -> bool {
5805 // We only attempt to scalarize instructions forming a single-use chain
5806 // from the original predicated block that would otherwise be vectorized.
5807 // Although not strictly necessary, we give up on instructions we know will
5808 // already be scalar to avoid traversing chains that are unlikely to be
5809 // beneficial.
5810 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5811 isScalarAfterVectorization(I, VF))
5812 return false;
5813 
5814 // If the instruction is scalar with predication, it will be analyzed
5815 // separately. We ignore it within the context of PredInst.
5816 if (isScalarWithPredication(I))
5817 return false;
5818 
5819 // If any of the instruction's operands are uniform after vectorization,
5820 // the instruction cannot be scalarized. This prevents, for example, a
5821 // masked load from being scalarized.
5822 //
5823 // We assume we will only emit a value for lane zero of an instruction
5824 // marked uniform after vectorization, rather than VF identical values.
5825 // Thus, if we scalarize an instruction that uses a uniform, we would
5826 // create uses of values corresponding to the lanes we aren't emitting code
5827 // for. This behavior can be changed by allowing getScalarValue to clone
5828 // the lane zero values for uniforms rather than asserting.
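// Hence the operand scan below: any instruction operand that is uniform
// after vectorization disqualifies I from being scalarized as part of this
// chain.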
5829 for (Use &U : I->operands()) 5830 if (auto *J = dyn_cast<Instruction>(U.get())) 5831 if (isUniformAfterVectorization(J, VF)) 5832 return false; 5833 5834 // Otherwise, we can scalarize the instruction. 5835 return true; 5836 }; 5837 5838 // Compute the expected cost discount from scalarizing the entire expression 5839 // feeding the predicated instruction. We currently only consider expressions 5840 // that are single-use instruction chains. 5841 Worklist.push_back(PredInst); 5842 while (!Worklist.empty()) { 5843 Instruction *I = Worklist.pop_back_val(); 5844 5845 // If we've already analyzed the instruction, there's nothing to do. 5846 if (ScalarCosts.find(I) != ScalarCosts.end()) 5847 continue; 5848 5849 // Compute the cost of the vector instruction. Note that this cost already 5850 // includes the scalarization overhead of the predicated instruction. 5851 unsigned VectorCost = getInstructionCost(I, VF).first; 5852 5853 // Compute the cost of the scalarized instruction. This cost is the cost of 5854 // the instruction as if it wasn't if-converted and instead remained in the 5855 // predicated block. We will scale this cost by block probability after 5856 // computing the scalarization overhead. 5857 unsigned ScalarCost = VF * getInstructionCost(I, 1).first; 5858 5859 // Compute the scalarization overhead of needed insertelement instructions 5860 // and phi nodes. 5861 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5862 ScalarCost += TTI.getScalarizationOverhead( 5863 cast<VectorType>(ToVectorTy(I->getType(), VF)), 5864 APInt::getAllOnesValue(VF), true, false); 5865 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI, 5866 TTI::TCK_RecipThroughput); 5867 } 5868 5869 // Compute the scalarization overhead of needed extractelement 5870 // instructions. For each of the instruction's operands, if the operand can 5871 // be scalarized, add it to the worklist; otherwise, account for the 5872 // overhead. 5873 for (Use &U : I->operands()) 5874 if (auto *J = dyn_cast<Instruction>(U.get())) { 5875 assert(VectorType::isValidElementType(J->getType()) && 5876 "Instruction has non-scalar type"); 5877 if (canBeScalarized(J)) 5878 Worklist.push_back(J); 5879 else if (needsExtract(J, VF)) 5880 ScalarCost += TTI.getScalarizationOverhead( 5881 cast<VectorType>(ToVectorTy(J->getType(), VF)), 5882 APInt::getAllOnesValue(VF), false, true); 5883 } 5884 5885 // Scale the total scalar cost by block probability. 5886 ScalarCost /= getReciprocalPredBlockProb(); 5887 5888 // Compute the discount. A non-negative discount means the vector version 5889 // of the instruction costs more, and scalarizing would be beneficial. 5890 Discount += VectorCost - ScalarCost; 5891 ScalarCosts[I] = ScalarCost; 5892 } 5893 5894 return Discount; 5895 } 5896 5897 LoopVectorizationCostModel::VectorizationCostTy 5898 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5899 VectorizationCostTy Cost; 5900 5901 // For each block. 5902 for (BasicBlock *BB : TheLoop->blocks()) { 5903 VectorizationCostTy BlockCost; 5904 5905 // For each instruction in the old loop. 5906 for (Instruction &I : BB->instructionsWithoutDebug()) { 5907 // Skip ignored values. 5908 if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I))) 5909 continue; 5910 5911 VectorizationCostTy C = getInstructionCost(&I, VF); 5912 5913 // Check if we should override the cost. 
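// (When this option is set, every instruction is given the same fixed cost,
// e.g. to make VF and interleave-count decisions deterministic in tests.)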
5914 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5915 C.first = ForceTargetInstructionCost; 5916 5917 BlockCost.first += C.first; 5918 BlockCost.second |= C.second; 5919 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5920 << " for VF " << VF << " For instruction: " << I 5921 << '\n'); 5922 } 5923 5924 // If we are vectorizing a predicated block, it will have been 5925 // if-converted. This means that the block's instructions (aside from 5926 // stores and instructions that may divide by zero) will now be 5927 // unconditionally executed. For the scalar case, we may not always execute 5928 // the predicated block. Thus, scale the block's cost by the probability of 5929 // executing it. 5930 if (VF == 1 && blockNeedsPredication(BB)) 5931 BlockCost.first /= getReciprocalPredBlockProb(); 5932 5933 Cost.first += BlockCost.first; 5934 Cost.second |= BlockCost.second; 5935 } 5936 5937 return Cost; 5938 } 5939 5940 /// Gets Address Access SCEV after verifying that the access pattern 5941 /// is loop invariant except the induction variable dependence. 5942 /// 5943 /// This SCEV can be sent to the Target in order to estimate the address 5944 /// calculation cost. 5945 static const SCEV *getAddressAccessSCEV( 5946 Value *Ptr, 5947 LoopVectorizationLegality *Legal, 5948 PredicatedScalarEvolution &PSE, 5949 const Loop *TheLoop) { 5950 5951 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5952 if (!Gep) 5953 return nullptr; 5954 5955 // We are looking for a gep with all loop invariant indices except for one 5956 // which should be an induction variable. 5957 auto SE = PSE.getSE(); 5958 unsigned NumOperands = Gep->getNumOperands(); 5959 for (unsigned i = 1; i < NumOperands; ++i) { 5960 Value *Opd = Gep->getOperand(i); 5961 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5962 !Legal->isInductionVariable(Opd)) 5963 return nullptr; 5964 } 5965 5966 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 5967 return PSE.getSCEV(Ptr); 5968 } 5969 5970 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 5971 return Legal->hasStride(I->getOperand(0)) || 5972 Legal->hasStride(I->getOperand(1)); 5973 } 5974 5975 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5976 unsigned VF) { 5977 assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); 5978 Type *ValTy = getMemInstValueType(I); 5979 auto SE = PSE.getSE(); 5980 5981 unsigned AS = getLoadStoreAddressSpace(I); 5982 Value *Ptr = getLoadStorePointerOperand(I); 5983 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5984 5985 // Figure out whether the access is strided and get the stride value 5986 // if it's known in compile time 5987 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5988 5989 // Get the cost of the scalar memory instruction and address computation. 5990 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5991 5992 // Don't pass *I here, since it is scalar but will actually be part of a 5993 // vectorized loop where the user of it is a vectorized instruction. 5994 const Align Alignment = getLoadStoreAlignment(I); 5995 Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 5996 Alignment, AS, 5997 TTI::TCK_RecipThroughput); 5998 5999 // Get the overhead of the extractelement and insertelement instructions 6000 // we might create due to scalarization. 
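// Illustrative sketch (not actual emitted IR): at VF = 4 a scalarized load
// conceptually becomes, per lane,
//   %a0 = extractelement <4 x i32*> %addrs, i32 0
//   %v0 = load i32, i32* %a0
//   %r0 = insertelement <4 x i32> %r, i32 %v0, i32 0
// and getScalarizationOverhead() charges for those extract/insert pairs.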
6001 Cost += getScalarizationOverhead(I, VF); 6002 6003 // If we have a predicated store, it may not be executed for each vector 6004 // lane. Scale the cost by the probability of executing the predicated 6005 // block. 6006 if (isPredicatedInst(I)) { 6007 Cost /= getReciprocalPredBlockProb(); 6008 6009 if (useEmulatedMaskMemRefHack(I)) 6010 // Artificially setting to a high enough value to practically disable 6011 // vectorization with such operations. 6012 Cost = 3000000; 6013 } 6014 6015 return Cost; 6016 } 6017 6018 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6019 unsigned VF) { 6020 Type *ValTy = getMemInstValueType(I); 6021 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6022 Value *Ptr = getLoadStorePointerOperand(I); 6023 unsigned AS = getLoadStoreAddressSpace(I); 6024 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6025 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6026 6027 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6028 "Stride should be 1 or -1 for consecutive memory access"); 6029 const Align Alignment = getLoadStoreAlignment(I); 6030 unsigned Cost = 0; 6031 if (Legal->isMaskRequired(I)) 6032 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6033 CostKind); 6034 else 6035 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6036 CostKind, I); 6037 6038 bool Reverse = ConsecutiveStride < 0; 6039 if (Reverse) 6040 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6041 return Cost; 6042 } 6043 6044 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6045 unsigned VF) { 6046 Type *ValTy = getMemInstValueType(I); 6047 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6048 const Align Alignment = getLoadStoreAlignment(I); 6049 unsigned AS = getLoadStoreAddressSpace(I); 6050 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6051 if (isa<LoadInst>(I)) { 6052 return TTI.getAddressComputationCost(ValTy) + 6053 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6054 CostKind) + 6055 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6056 } 6057 StoreInst *SI = cast<StoreInst>(I); 6058 6059 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6060 return TTI.getAddressComputationCost(ValTy) + 6061 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6062 CostKind) + 6063 (isLoopInvariantStoreValue 6064 ? 
0 6065 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6066 VF - 1)); 6067 } 6068 6069 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6070 unsigned VF) { 6071 Type *ValTy = getMemInstValueType(I); 6072 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6073 const Align Alignment = getLoadStoreAlignment(I); 6074 const Value *Ptr = getLoadStorePointerOperand(I); 6075 6076 return TTI.getAddressComputationCost(VectorTy) + 6077 TTI.getGatherScatterOpCost( 6078 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6079 TargetTransformInfo::TCK_RecipThroughput, I); 6080 } 6081 6082 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6083 unsigned VF) { 6084 Type *ValTy = getMemInstValueType(I); 6085 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6086 unsigned AS = getLoadStoreAddressSpace(I); 6087 6088 auto Group = getInterleavedAccessGroup(I); 6089 assert(Group && "Fail to get an interleaved access group."); 6090 6091 unsigned InterleaveFactor = Group->getFactor(); 6092 auto *WideVecTy = FixedVectorType::get(ValTy, VF * InterleaveFactor); 6093 6094 // Holds the indices of existing members in an interleaved load group. 6095 // An interleaved store group doesn't need this as it doesn't allow gaps. 6096 SmallVector<unsigned, 4> Indices; 6097 if (isa<LoadInst>(I)) { 6098 for (unsigned i = 0; i < InterleaveFactor; i++) 6099 if (Group->getMember(i)) 6100 Indices.push_back(i); 6101 } 6102 6103 // Calculate the cost of the whole interleaved group. 6104 bool UseMaskForGaps = 6105 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6106 unsigned Cost = TTI.getInterleavedMemoryOpCost( 6107 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6108 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6109 6110 if (Group->isReverse()) { 6111 // TODO: Add support for reversed masked interleaved access. 6112 assert(!Legal->isMaskRequired(I) && 6113 "Reverse masked interleaved access not supported."); 6114 Cost += Group->getNumMembers() * 6115 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6116 } 6117 return Cost; 6118 } 6119 6120 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6121 unsigned VF) { 6122 // Calculate scalar cost only. Vectorization cost should be ready at this 6123 // moment. 6124 if (VF == 1) { 6125 Type *ValTy = getMemInstValueType(I); 6126 const Align Alignment = getLoadStoreAlignment(I); 6127 unsigned AS = getLoadStoreAddressSpace(I); 6128 6129 return TTI.getAddressComputationCost(ValTy) + 6130 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6131 TTI::TCK_RecipThroughput, I); 6132 } 6133 return getWideningCost(I, VF); 6134 } 6135 6136 LoopVectorizationCostModel::VectorizationCostTy 6137 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 6138 // If we know that this instruction will remain uniform, check the cost of 6139 // the scalar version. 6140 if (isUniformAfterVectorization(I, VF)) 6141 VF = 1; 6142 6143 if (VF > 1 && isProfitableToScalarize(I, VF)) 6144 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6145 6146 // Forced scalars do not have any scalarization overhead. 
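// Such instructions are simply costed as VF independent scalar copies (see
// the early return below); no extract/insert overhead is added for them.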
6147 auto ForcedScalar = ForcedScalars.find(VF); 6148 if (VF > 1 && ForcedScalar != ForcedScalars.end()) { 6149 auto InstSet = ForcedScalar->second; 6150 if (InstSet.count(I)) 6151 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); 6152 } 6153 6154 Type *VectorTy; 6155 unsigned C = getInstructionCost(I, VF, VectorTy); 6156 6157 bool TypeNotScalarized = 6158 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; 6159 return VectorizationCostTy(C, TypeNotScalarized); 6160 } 6161 6162 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6163 unsigned VF) { 6164 6165 if (VF == 1) 6166 return 0; 6167 6168 unsigned Cost = 0; 6169 Type *RetTy = ToVectorTy(I->getType(), VF); 6170 if (!RetTy->isVoidTy() && 6171 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6172 Cost += TTI.getScalarizationOverhead( 6173 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF), true, false); 6174 6175 // Some targets keep addresses scalar. 6176 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6177 return Cost; 6178 6179 // Some targets support efficient element stores. 6180 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6181 return Cost; 6182 6183 // Collect operands to consider. 6184 CallInst *CI = dyn_cast<CallInst>(I); 6185 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6186 6187 // Skip operands that do not require extraction/scalarization and do not incur 6188 // any overhead. 6189 return Cost + TTI.getOperandsScalarizationOverhead( 6190 filterExtractingOperands(Ops, VF), VF); 6191 } 6192 6193 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { 6194 if (VF == 1) 6195 return; 6196 NumPredStores = 0; 6197 for (BasicBlock *BB : TheLoop->blocks()) { 6198 // For each instruction in the old loop. 6199 for (Instruction &I : *BB) { 6200 Value *Ptr = getLoadStorePointerOperand(&I); 6201 if (!Ptr) 6202 continue; 6203 6204 // TODO: We should generate better code and update the cost model for 6205 // predicated uniform stores. Today they are treated as any other 6206 // predicated store (see added test cases in 6207 // invariant-store-vectorization.ll). 6208 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6209 NumPredStores++; 6210 6211 if (Legal->isUniform(Ptr) && 6212 // Conditional loads and stores should be scalarized and predicated. 6213 // isScalarWithPredication cannot be used here since masked 6214 // gather/scatters are not considered scalar with predication. 6215 !Legal->blockNeedsPredication(I.getParent())) { 6216 // TODO: Avoid replicating loads and stores instead of 6217 // relying on instcombine to remove them. 6218 // Load: Scalar load + broadcast 6219 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6220 unsigned Cost = getUniformMemOpCost(&I, VF); 6221 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6222 continue; 6223 } 6224 6225 // We assume that widening is the best solution when possible. 6226 if (memoryInstructionCanBeWidened(&I, VF)) { 6227 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6228 int ConsecutiveStride = 6229 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6230 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6231 "Expected consecutive stride."); 6232 InstWidening Decision = 6233 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6234 setWideningDecision(&I, VF, Decision, Cost); 6235 continue; 6236 } 6237 6238 // Choose between Interleaving, Gather/Scatter or Scalarization. 
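// InterleaveCost and GatherScatterCost start at UINT_MAX and are only filled
// in when the corresponding strategy is legal for this access. The cheapest
// option wins; a tie between interleaving and gather/scatter is resolved in
// favor of interleaving (see the comparison below).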
6239 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6240 unsigned NumAccesses = 1; 6241 if (isAccessInterleaved(&I)) { 6242 auto Group = getInterleavedAccessGroup(&I); 6243 assert(Group && "Fail to get an interleaved access group."); 6244 6245 // Make one decision for the whole group. 6246 if (getWideningDecision(&I, VF) != CM_Unknown) 6247 continue; 6248 6249 NumAccesses = Group->getNumMembers(); 6250 if (interleavedAccessCanBeWidened(&I, VF)) 6251 InterleaveCost = getInterleaveGroupCost(&I, VF); 6252 } 6253 6254 unsigned GatherScatterCost = 6255 isLegalGatherOrScatter(&I) 6256 ? getGatherScatterCost(&I, VF) * NumAccesses 6257 : std::numeric_limits<unsigned>::max(); 6258 6259 unsigned ScalarizationCost = 6260 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6261 6262 // Choose better solution for the current VF, 6263 // write down this decision and use it during vectorization. 6264 unsigned Cost; 6265 InstWidening Decision; 6266 if (InterleaveCost <= GatherScatterCost && 6267 InterleaveCost < ScalarizationCost) { 6268 Decision = CM_Interleave; 6269 Cost = InterleaveCost; 6270 } else if (GatherScatterCost < ScalarizationCost) { 6271 Decision = CM_GatherScatter; 6272 Cost = GatherScatterCost; 6273 } else { 6274 Decision = CM_Scalarize; 6275 Cost = ScalarizationCost; 6276 } 6277 // If the instructions belongs to an interleave group, the whole group 6278 // receives the same decision. The whole group receives the cost, but 6279 // the cost will actually be assigned to one instruction. 6280 if (auto Group = getInterleavedAccessGroup(&I)) 6281 setWideningDecision(Group, VF, Decision, Cost); 6282 else 6283 setWideningDecision(&I, VF, Decision, Cost); 6284 } 6285 } 6286 6287 // Make sure that any load of address and any other address computation 6288 // remains scalar unless there is gather/scatter support. This avoids 6289 // inevitable extracts into address registers, and also has the benefit of 6290 // activating LSR more, since that pass can't optimize vectorized 6291 // addresses. 6292 if (TTI.prefersVectorizedAddressing()) 6293 return; 6294 6295 // Start with all scalar pointer uses. 6296 SmallPtrSet<Instruction *, 8> AddrDefs; 6297 for (BasicBlock *BB : TheLoop->blocks()) 6298 for (Instruction &I : *BB) { 6299 Instruction *PtrDef = 6300 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6301 if (PtrDef && TheLoop->contains(PtrDef) && 6302 getWideningDecision(&I, VF) != CM_GatherScatter) 6303 AddrDefs.insert(PtrDef); 6304 } 6305 6306 // Add all instructions used to generate the addresses. 6307 SmallVector<Instruction *, 4> Worklist; 6308 for (auto *I : AddrDefs) 6309 Worklist.push_back(I); 6310 while (!Worklist.empty()) { 6311 Instruction *I = Worklist.pop_back_val(); 6312 for (auto &Op : I->operands()) 6313 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6314 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6315 AddrDefs.insert(InstOp).second) 6316 Worklist.push_back(InstOp); 6317 } 6318 6319 for (auto *I : AddrDefs) { 6320 if (isa<LoadInst>(I)) { 6321 // Setting the desired widening decision should ideally be handled in 6322 // by cost functions, but since this involves the task of finding out 6323 // if the loaded register is involved in an address computation, it is 6324 // instead changed here when we know this is the case. 6325 InstWidening Decision = getWideningDecision(I, VF); 6326 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6327 // Scalarize a widened load of address. 
6328 setWideningDecision(I, VF, CM_Scalarize, 6329 (VF * getMemoryInstructionCost(I, 1))); 6330 else if (auto Group = getInterleavedAccessGroup(I)) { 6331 // Scalarize an interleave group of address loads. 6332 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6333 if (Instruction *Member = Group->getMember(I)) 6334 setWideningDecision(Member, VF, CM_Scalarize, 6335 (VF * getMemoryInstructionCost(Member, 1))); 6336 } 6337 } 6338 } else 6339 // Make sure I gets scalarized and a cost estimate without 6340 // scalarization overhead. 6341 ForcedScalars[VF].insert(I); 6342 } 6343 } 6344 6345 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6346 unsigned VF, 6347 Type *&VectorTy) { 6348 Type *RetTy = I->getType(); 6349 if (canTruncateToMinimalBitwidth(I, VF)) 6350 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6351 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6352 auto SE = PSE.getSE(); 6353 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6354 6355 // TODO: We need to estimate the cost of intrinsic calls. 6356 switch (I->getOpcode()) { 6357 case Instruction::GetElementPtr: 6358 // We mark this instruction as zero-cost because the cost of GEPs in 6359 // vectorized code depends on whether the corresponding memory instruction 6360 // is scalarized or not. Therefore, we handle GEPs with the memory 6361 // instruction cost. 6362 return 0; 6363 case Instruction::Br: { 6364 // In cases of scalarized and predicated instructions, there will be VF 6365 // predicated blocks in the vectorized loop. Each branch around these 6366 // blocks requires also an extract of its vector compare i1 element. 6367 bool ScalarPredicatedBB = false; 6368 BranchInst *BI = cast<BranchInst>(I); 6369 if (VF > 1 && BI->isConditional() && 6370 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 6371 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 6372 ScalarPredicatedBB = true; 6373 6374 if (ScalarPredicatedBB) { 6375 // Return cost for branches around scalarized and predicated blocks. 6376 auto *Vec_i1Ty = 6377 FixedVectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6378 return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF), 6379 false, true) + 6380 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF)); 6381 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) 6382 // The back-edge branch will remain, as will all scalar branches. 6383 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6384 else 6385 // This branch will be eliminated by if-conversion. 6386 return 0; 6387 // Note: We currently assume zero cost for an unconditional branch inside 6388 // a predicated block since it will become a fall-through, although we 6389 // may decide in the future to call TTI for all branches. 6390 } 6391 case Instruction::PHI: { 6392 auto *Phi = cast<PHINode>(I); 6393 6394 // First-order recurrences are replaced by vector shuffles inside the loop. 6395 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6396 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) 6397 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 6398 cast<VectorType>(VectorTy), VF - 1, 6399 FixedVectorType::get(RetTy, 1)); 6400 6401 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6402 // converted into select instructions. We require N - 1 selects per phi 6403 // node, where N is the number of incoming values. 
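// Illustrative example: a blend phi such as
//   %p = phi i32 [ %a, %bb0 ], [ %b, %bb1 ], [ %c, %bb2 ]
// has three incoming values and is conceptually lowered to two vector
// selects keyed on the incoming edges' masks, hence the (N - 1) factor below.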
6404 if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) 6405 return (Phi->getNumIncomingValues() - 1) * 6406 TTI.getCmpSelInstrCost( 6407 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6408 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6409 CostKind); 6410 6411 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6412 } 6413 case Instruction::UDiv: 6414 case Instruction::SDiv: 6415 case Instruction::URem: 6416 case Instruction::SRem: 6417 // If we have a predicated instruction, it may not be executed for each 6418 // vector lane. Get the scalarization cost and scale this amount by the 6419 // probability of executing the predicated block. If the instruction is not 6420 // predicated, we fall through to the next case. 6421 if (VF > 1 && isScalarWithPredication(I)) { 6422 unsigned Cost = 0; 6423 6424 // These instructions have a non-void type, so account for the phi nodes 6425 // that we will create. This cost is likely to be zero. The phi node 6426 // cost, if any, should be scaled by the block probability because it 6427 // models a copy at the end of each predicated block. 6428 Cost += VF * TTI.getCFInstrCost(Instruction::PHI, CostKind); 6429 6430 // The cost of the non-predicated instruction. 6431 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 6432 6433 // The cost of insertelement and extractelement instructions needed for 6434 // scalarization. 6435 Cost += getScalarizationOverhead(I, VF); 6436 6437 // Scale the cost by the probability of executing the predicated blocks. 6438 // This assumes the predicated block for each vector lane is equally 6439 // likely. 6440 return Cost / getReciprocalPredBlockProb(); 6441 } 6442 LLVM_FALLTHROUGH; 6443 case Instruction::Add: 6444 case Instruction::FAdd: 6445 case Instruction::Sub: 6446 case Instruction::FSub: 6447 case Instruction::Mul: 6448 case Instruction::FMul: 6449 case Instruction::FDiv: 6450 case Instruction::FRem: 6451 case Instruction::Shl: 6452 case Instruction::LShr: 6453 case Instruction::AShr: 6454 case Instruction::And: 6455 case Instruction::Or: 6456 case Instruction::Xor: { 6457 // Since we will replace the stride by 1 the multiplication should go away. 6458 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6459 return 0; 6460 // Certain instructions can be cheaper to vectorize if they have a constant 6461 // second vector operand. One example of this are shifts on x86. 6462 Value *Op2 = I->getOperand(1); 6463 TargetTransformInfo::OperandValueProperties Op2VP; 6464 TargetTransformInfo::OperandValueKind Op2VK = 6465 TTI.getOperandInfo(Op2, Op2VP); 6466 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6467 Op2VK = TargetTransformInfo::OK_UniformValue; 6468 6469 SmallVector<const Value *, 4> Operands(I->operand_values()); 6470 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6471 return N * TTI.getArithmeticInstrCost( 6472 I->getOpcode(), VectorTy, CostKind, 6473 TargetTransformInfo::OK_AnyValue, 6474 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6475 } 6476 case Instruction::FNeg: { 6477 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6478 return N * TTI.getArithmeticInstrCost( 6479 I->getOpcode(), VectorTy, CostKind, 6480 TargetTransformInfo::OK_AnyValue, 6481 TargetTransformInfo::OK_AnyValue, 6482 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6483 I->getOperand(0), I); 6484 } 6485 case Instruction::Select: { 6486 SelectInst *SI = cast<SelectInst>(I); 6487 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6488 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6489 Type *CondTy = SI->getCondition()->getType(); 6490 if (!ScalarCond) 6491 CondTy = FixedVectorType::get(CondTy, VF); 6492 6493 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 6494 CostKind, I); 6495 } 6496 case Instruction::ICmp: 6497 case Instruction::FCmp: { 6498 Type *ValTy = I->getOperand(0)->getType(); 6499 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6500 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6501 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6502 VectorTy = ToVectorTy(ValTy, VF); 6503 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind, 6504 I); 6505 } 6506 case Instruction::Store: 6507 case Instruction::Load: { 6508 unsigned Width = VF; 6509 if (Width > 1) { 6510 InstWidening Decision = getWideningDecision(I, Width); 6511 assert(Decision != CM_Unknown && 6512 "CM decision should be taken at this point"); 6513 if (Decision == CM_Scalarize) 6514 Width = 1; 6515 } 6516 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6517 return getMemoryInstructionCost(I, VF); 6518 } 6519 case Instruction::ZExt: 6520 case Instruction::SExt: 6521 case Instruction::FPToUI: 6522 case Instruction::FPToSI: 6523 case Instruction::FPExt: 6524 case Instruction::PtrToInt: 6525 case Instruction::IntToPtr: 6526 case Instruction::SIToFP: 6527 case Instruction::UIToFP: 6528 case Instruction::Trunc: 6529 case Instruction::FPTrunc: 6530 case Instruction::BitCast: { 6531 // Computes the CastContextHint from a Load/Store instruction. 6532 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 6533 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 6534 "Expected a load or a store!"); 6535 6536 if (VF == 1 || !TheLoop->contains(I)) 6537 return TTI::CastContextHint::Normal; 6538 6539 switch (getWideningDecision(I, VF)) { 6540 case LoopVectorizationCostModel::CM_GatherScatter: 6541 return TTI::CastContextHint::GatherScatter; 6542 case LoopVectorizationCostModel::CM_Interleave: 6543 return TTI::CastContextHint::Interleave; 6544 case LoopVectorizationCostModel::CM_Scalarize: 6545 case LoopVectorizationCostModel::CM_Widen: 6546 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 6547 : TTI::CastContextHint::Normal; 6548 case LoopVectorizationCostModel::CM_Widen_Reverse: 6549 return TTI::CastContextHint::Reversed; 6550 case LoopVectorizationCostModel::CM_Unknown: 6551 llvm_unreachable("Instr did not go through cost modelling?"); 6552 } 6553 6554 llvm_unreachable("Unhandled case!"); 6555 }; 6556 6557 unsigned Opcode = I->getOpcode(); 6558 TTI::CastContextHint CCH = TTI::CastContextHint::None; 6559 // For Trunc, the context is the only user, which must be a StoreInst. 6560 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 6561 if (I->hasOneUse()) 6562 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 6563 CCH = ComputeCCH(Store); 6564 } 6565 // For Z/Sext, the context is the operand, which must be a LoadInst. 
6566 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 6567 Opcode == Instruction::FPExt) { 6568 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 6569 CCH = ComputeCCH(Load); 6570 } 6571 6572 // We optimize the truncation of induction variables having constant 6573 // integer steps. The cost of these truncations is the same as the scalar 6574 // operation. 6575 if (isOptimizableIVTruncate(I, VF)) { 6576 auto *Trunc = cast<TruncInst>(I); 6577 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6578 Trunc->getSrcTy(), CCH, CostKind, Trunc); 6579 } 6580 6581 Type *SrcScalarTy = I->getOperand(0)->getType(); 6582 Type *SrcVecTy = 6583 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6584 if (canTruncateToMinimalBitwidth(I, VF)) { 6585 // This cast is going to be shrunk. This may remove the cast or it might 6586 // turn it into slightly different cast. For example, if MinBW == 16, 6587 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6588 // 6589 // Calculate the modified src and dest types. 6590 Type *MinVecTy = VectorTy; 6591 if (Opcode == Instruction::Trunc) { 6592 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6593 VectorTy = 6594 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6595 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 6596 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6597 VectorTy = 6598 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6599 } 6600 } 6601 6602 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6603 return N * 6604 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 6605 } 6606 case Instruction::Call: { 6607 bool NeedToScalarize; 6608 CallInst *CI = cast<CallInst>(I); 6609 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6610 if (getVectorIntrinsicIDForCall(CI, TLI)) 6611 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6612 return CallCost; 6613 } 6614 default: 6615 // The cost of executing VF copies of the scalar instruction. This opcode 6616 // is unknown. Assume that it is the same as 'mul'. 6617 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, 6618 CostKind) + 6619 getScalarizationOverhead(I, VF); 6620 } // end of switch. 
6621 } 6622 6623 char LoopVectorize::ID = 0; 6624 6625 static const char lv_name[] = "Loop Vectorization"; 6626 6627 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6628 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6629 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6630 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6631 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6632 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6633 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6634 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6635 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6636 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6637 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6638 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6639 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6640 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6641 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6642 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6643 6644 namespace llvm { 6645 6646 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6647 6648 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6649 bool VectorizeOnlyWhenForced) { 6650 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6651 } 6652 6653 } // end namespace llvm 6654 6655 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6656 // Check if the pointer operand of a load or store instruction is 6657 // consecutive. 6658 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6659 return Legal->isConsecutivePtr(Ptr); 6660 return false; 6661 } 6662 6663 void LoopVectorizationCostModel::collectValuesToIgnore() { 6664 // Ignore ephemeral values. 6665 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6666 6667 // Ignore type-promoting instructions we identified during reduction 6668 // detection. 6669 for (auto &Reduction : Legal->getReductionVars()) { 6670 RecurrenceDescriptor &RedDes = Reduction.second; 6671 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6672 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6673 } 6674 // Ignore type-casting instructions we identified during induction 6675 // detection. 6676 for (auto &Induction : Legal->getInductionVars()) { 6677 InductionDescriptor &IndDes = Induction.second; 6678 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6679 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6680 } 6681 } 6682 6683 void LoopVectorizationCostModel::collectInLoopReductions() { 6684 // For the moment, without predicated reduction instructions, we do not 6685 // support inloop reductions whilst folding the tail, and hence in those cases 6686 // all reductions are currently out of the loop. 6687 if (!PreferInLoopReductions || foldTailByMasking()) 6688 return; 6689 6690 for (auto &Reduction : Legal->getReductionVars()) { 6691 PHINode *Phi = Reduction.first; 6692 RecurrenceDescriptor &RdxDesc = Reduction.second; 6693 6694 // We don't collect reductions that are type promoted (yet). 6695 if (RdxDesc.getRecurrenceType() != Phi->getType()) 6696 continue; 6697 6698 // Check that we can correctly put the reductions into the loop, by 6699 // finding the chain of operations that leads from the phi to the loop 6700 // exit value. 
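// If no such chain exists, getReductionOpChain returns an empty list and the
// reduction stays out of the loop (see InLoop below).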
6701 SmallVector<Instruction *, 4> ReductionOperations = 6702 RdxDesc.getReductionOpChain(Phi, TheLoop); 6703 bool InLoop = !ReductionOperations.empty(); 6704 if (InLoop) 6705 InLoopReductionChains[Phi] = ReductionOperations; 6706 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 6707 << " reduction for phi: " << *Phi << "\n"); 6708 } 6709 } 6710 6711 // TODO: we could return a pair of values that specify the max VF and 6712 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6713 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 6714 // doesn't have a cost model that can choose which plan to execute if 6715 // more than one is generated. 6716 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 6717 LoopVectorizationCostModel &CM) { 6718 unsigned WidestType; 6719 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 6720 return WidestVectorRegBits / WidestType; 6721 } 6722 6723 VectorizationFactor 6724 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { 6725 unsigned VF = UserVF; 6726 // Outer loop handling: They may require CFG and instruction level 6727 // transformations before even evaluating whether vectorization is profitable. 6728 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 6729 // the vectorization pipeline. 6730 if (!OrigLoop->empty()) { 6731 // If the user doesn't provide a vectorization factor, determine a 6732 // reasonable one. 6733 if (!UserVF) { 6734 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM); 6735 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 6736 6737 // Make sure we have a VF > 1 for stress testing. 6738 if (VPlanBuildStressTest && VF < 2) { 6739 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 6740 << "overriding computed VF.\n"); 6741 VF = 4; 6742 } 6743 } 6744 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 6745 assert(isPowerOf2_32(VF) && "VF needs to be a power of two"); 6746 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF 6747 << " to build VPlans.\n"); 6748 buildVPlans(VF, VF); 6749 6750 // For VPlan build stress testing, we bail out after VPlan construction. 6751 if (VPlanBuildStressTest) 6752 return VectorizationFactor::Disabled(); 6753 6754 return {VF, 0}; 6755 } 6756 6757 LLVM_DEBUG( 6758 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 6759 "VPlan-native path.\n"); 6760 return VectorizationFactor::Disabled(); 6761 } 6762 6763 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF, 6764 unsigned UserIC) { 6765 assert(OrigLoop->empty() && "Inner loop expected."); 6766 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); 6767 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 6768 return None; 6769 6770 // Invalidate interleave groups if all blocks of loop will be predicated. 6771 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 6772 !useMaskedInterleavedAccesses(*TTI)) { 6773 LLVM_DEBUG( 6774 dbgs() 6775 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 6776 "which requires masked-interleaved support.\n"); 6777 if (CM.InterleaveInfo.invalidateGroups()) 6778 // Invalidating interleave groups also requires invalidating all decisions 6779 // based on them, which includes widening decisions and uniform and scalar 6780 // values. 
6781 CM.invalidateCostModelingDecisions(); 6782 } 6783 6784 if (UserVF) { 6785 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 6786 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); 6787 // Collect the instructions (and their associated costs) that will be more 6788 // profitable to scalarize. 6789 CM.selectUserVectorizationFactor(UserVF); 6790 CM.collectInLoopReductions(); 6791 buildVPlansWithVPRecipes(UserVF, UserVF); 6792 LLVM_DEBUG(printPlans(dbgs())); 6793 return {{UserVF, 0}}; 6794 } 6795 6796 unsigned MaxVF = MaybeMaxVF.getValue(); 6797 assert(MaxVF != 0 && "MaxVF is zero."); 6798 6799 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 6800 // Collect Uniform and Scalar instructions after vectorization with VF. 6801 CM.collectUniformsAndScalars(VF); 6802 6803 // Collect the instructions (and their associated costs) that will be more 6804 // profitable to scalarize. 6805 if (VF > 1) 6806 CM.collectInstsToScalarize(VF); 6807 } 6808 6809 CM.collectInLoopReductions(); 6810 6811 buildVPlansWithVPRecipes(1, MaxVF); 6812 LLVM_DEBUG(printPlans(dbgs())); 6813 if (MaxVF == 1) 6814 return VectorizationFactor::Disabled(); 6815 6816 // Select the optimal vectorization factor. 6817 return CM.selectVectorizationFactor(MaxVF); 6818 } 6819 6820 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { 6821 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 6822 << '\n'); 6823 BestVF = VF; 6824 BestUF = UF; 6825 6826 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 6827 return !Plan->hasVF(VF); 6828 }); 6829 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 6830 } 6831 6832 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 6833 DominatorTree *DT) { 6834 // Perform the actual loop transformation. 6835 6836 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 6837 VPCallbackILV CallbackILV(ILV); 6838 6839 VPTransformState State{BestVF, BestUF, LI, 6840 DT, ILV.Builder, ILV.VectorLoopValueMap, 6841 &ILV, CallbackILV}; 6842 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 6843 State.TripCount = ILV.getOrCreateTripCount(nullptr); 6844 State.CanonicalIV = ILV.Induction; 6845 6846 //===------------------------------------------------===// 6847 // 6848 // Notice: any optimization or new instruction that go 6849 // into the code below should also be implemented in 6850 // the cost-model. 6851 // 6852 //===------------------------------------------------===// 6853 6854 // 2. Copy and widen instructions from the old loop into the new loop. 6855 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 6856 VPlans.front()->execute(&State); 6857 6858 // 3. Fix the vectorized code: take care of header phi's, live-outs, 6859 // predication, updating analyses. 6860 ILV.fixVectorizedLoop(); 6861 } 6862 6863 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 6864 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6865 BasicBlock *Latch = OrigLoop->getLoopLatch(); 6866 6867 // We create new control-flow for the vectorized loop, so the original 6868 // condition will be dead after vectorization if it's only used by the 6869 // branch. 6870 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 6871 if (Cmp && Cmp->hasOneUse()) 6872 DeadInstructions.insert(Cmp); 6873 6874 // We create new "steps" for induction variable updates to which the original 6875 // induction variables map. 
An original update instruction will be dead if 6876 // all its users except the induction variable are dead. 6877 for (auto &Induction : Legal->getInductionVars()) { 6878 PHINode *Ind = Induction.first; 6879 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 6880 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 6881 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 6882 })) 6883 DeadInstructions.insert(IndUpdate); 6884 6885 // We record as "Dead" also the type-casting instructions we had identified 6886 // during induction analysis. We don't need any handling for them in the 6887 // vectorized loop because we have proven that, under a proper runtime 6888 // test guarding the vectorized loop, the value of the phi, and the casted 6889 // value of the phi, are the same. The last instruction in this casting chain 6890 // will get its scalar/vector/widened def from the scalar/vector/widened def 6891 // of the respective phi node. Any other casts in the induction def-use chain 6892 // have no other uses outside the phi update chain, and will be ignored. 6893 InductionDescriptor &IndDes = Induction.second; 6894 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6895 DeadInstructions.insert(Casts.begin(), Casts.end()); 6896 } 6897 } 6898 6899 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6900 6901 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6902 6903 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6904 Instruction::BinaryOps BinOp) { 6905 // When unrolling and the VF is 1, we only need to add a simple scalar. 6906 Type *Ty = Val->getType(); 6907 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6908 6909 if (Ty->isFloatingPointTy()) { 6910 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6911 6912 // Floating point operations had to be 'fast' to enable the unrolling. 6913 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6914 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6915 } 6916 Constant *C = ConstantInt::get(Ty, StartIdx); 6917 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6918 } 6919 6920 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6921 SmallVector<Metadata *, 4> MDs; 6922 // Reserve first location for self reference to the LoopID metadata node. 6923 MDs.push_back(nullptr); 6924 bool IsUnrollMetadata = false; 6925 MDNode *LoopID = L->getLoopID(); 6926 if (LoopID) { 6927 // First find existing loop unrolling disable metadata. 6928 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6929 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6930 if (MD) { 6931 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6932 IsUnrollMetadata = 6933 S && S->getString().startswith("llvm.loop.unroll.disable"); 6934 } 6935 MDs.push_back(LoopID->getOperand(i)); 6936 } 6937 } 6938 6939 if (!IsUnrollMetadata) { 6940 // Add runtime unroll disable metadata. 6941 LLVMContext &Context = L->getHeader()->getContext(); 6942 SmallVector<Metadata *, 1> DisableOperands; 6943 DisableOperands.push_back( 6944 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6945 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6946 MDs.push_back(DisableNode); 6947 MDNode *NewLoopID = MDNode::get(Context, MDs); 6948 // Set operand 0 to refer to the loop id itself. 
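// For illustration only (metadata numbers are made up), the resulting loop
// metadata is self-referential and looks roughly like:
//   !0 = distinct !{!0, ..., !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}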
6949 NewLoopID->replaceOperandWith(0, NewLoopID); 6950 L->setLoopID(NewLoopID); 6951 } 6952 } 6953 6954 bool LoopVectorizationPlanner::getDecisionAndClampRange( 6955 const std::function<bool(unsigned)> &Predicate, VFRange &Range) { 6956 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 6957 bool PredicateAtRangeStart = Predicate(Range.Start); 6958 6959 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 6960 if (Predicate(TmpVF) != PredicateAtRangeStart) { 6961 Range.End = TmpVF; 6962 break; 6963 } 6964 6965 return PredicateAtRangeStart; 6966 } 6967 6968 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 6969 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 6970 /// of VF's starting at a given VF and extending it as much as possible. Each 6971 /// vectorization decision can potentially shorten this sub-range during 6972 /// buildVPlan(). 6973 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 6974 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6975 VFRange SubRange = {VF, MaxVF + 1}; 6976 VPlans.push_back(buildVPlan(SubRange)); 6977 VF = SubRange.End; 6978 } 6979 } 6980 6981 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 6982 VPlanPtr &Plan) { 6983 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 6984 6985 // Look for cached value. 6986 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 6987 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 6988 if (ECEntryIt != EdgeMaskCache.end()) 6989 return ECEntryIt->second; 6990 6991 VPValue *SrcMask = createBlockInMask(Src, Plan); 6992 6993 // The terminator has to be a branch inst! 6994 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 6995 assert(BI && "Unexpected terminator found"); 6996 6997 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 6998 return EdgeMaskCache[Edge] = SrcMask; 6999 7000 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 7001 assert(EdgeMask && "No Edge Mask found for condition"); 7002 7003 if (BI->getSuccessor(0) != Dst) 7004 EdgeMask = Builder.createNot(EdgeMask); 7005 7006 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 7007 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 7008 7009 return EdgeMaskCache[Edge] = EdgeMask; 7010 } 7011 7012 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 7013 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 7014 7015 // Look for cached value. 7016 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 7017 if (BCEntryIt != BlockMaskCache.end()) 7018 return BCEntryIt->second; 7019 7020 // All-one mask is modelled as no-mask following the convention for masked 7021 // load/store/gather/scatter. Initialize BlockMask to no-mask. 7022 VPValue *BlockMask = nullptr; 7023 7024 if (OrigLoop->getHeader() == BB) { 7025 if (!CM.blockNeedsPredication(BB)) 7026 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 7027 7028 // Introduce the early-exit compare IV <= BTC to form header block mask. 7029 // This is used instead of IV < TC because TC may wrap, unlike BTC. 7030 // Start by constructing the desired canonical IV. 
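// As an illustrative sketch (VF and lane values chosen arbitrarily): with
// VF = 4 and a vector iteration starting at index i, the header mask built
// below is conceptually
//   icmp ule <i, i+1, i+2, i+3>, <BTC, BTC, BTC, BTC>
// which stays well-defined even when the trip count BTC + 1 would wrap.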
7031 VPValue *IV = nullptr; 7032 if (Legal->getPrimaryInduction()) 7033 IV = Plan->getVPValue(Legal->getPrimaryInduction()); 7034 else { 7035 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 7036 Builder.getInsertBlock()->appendRecipe(IVRecipe); 7037 IV = IVRecipe->getVPValue(); 7038 } 7039 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 7040 bool TailFolded = !CM.isScalarEpilogueAllowed(); 7041 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) 7042 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC}); 7043 else 7044 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 7045 return BlockMaskCache[BB] = BlockMask; 7046 } 7047 7048 // This is the block mask. We OR all incoming edges. 7049 for (auto *Predecessor : predecessors(BB)) { 7050 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 7051 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 7052 return BlockMaskCache[BB] = EdgeMask; 7053 7054 if (!BlockMask) { // BlockMask has its initialized nullptr value. 7055 BlockMask = EdgeMask; 7056 continue; 7057 } 7058 7059 BlockMask = Builder.createOr(BlockMask, EdgeMask); 7060 } 7061 7062 return BlockMaskCache[BB] = BlockMask; 7063 } 7064 7065 VPWidenMemoryInstructionRecipe * 7066 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 7067 VPlanPtr &Plan) { 7068 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7069 "Must be called with either a load or store"); 7070 7071 auto willWiden = [&](unsigned VF) -> bool { 7072 if (VF == 1) 7073 return false; 7074 LoopVectorizationCostModel::InstWidening Decision = 7075 CM.getWideningDecision(I, VF); 7076 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 7077 "CM decision should be taken at this point."); 7078 if (Decision == LoopVectorizationCostModel::CM_Interleave) 7079 return true; 7080 if (CM.isScalarAfterVectorization(I, VF) || 7081 CM.isProfitableToScalarize(I, VF)) 7082 return false; 7083 return Decision != LoopVectorizationCostModel::CM_Scalarize; 7084 }; 7085 7086 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7087 return nullptr; 7088 7089 VPValue *Mask = nullptr; 7090 if (Legal->isMaskRequired(I)) 7091 Mask = createBlockInMask(I->getParent(), Plan); 7092 7093 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 7094 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 7095 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 7096 7097 StoreInst *Store = cast<StoreInst>(I); 7098 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 7099 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 7100 } 7101 7102 VPWidenIntOrFpInductionRecipe * 7103 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { 7104 // Check if this is an integer or fp induction. If so, build the recipe that 7105 // produces its scalar and vector values. 7106 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 7107 if (II.getKind() == InductionDescriptor::IK_IntInduction || 7108 II.getKind() == InductionDescriptor::IK_FpInduction) 7109 return new VPWidenIntOrFpInductionRecipe(Phi); 7110 7111 return nullptr; 7112 } 7113 7114 VPWidenIntOrFpInductionRecipe * 7115 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, 7116 VFRange &Range) const { 7117 // Optimize the special case where the source is a constant integer 7118 // induction variable. 
Notice that we can only optimize the 'trunc' case 7119 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 7120 // (c) other casts depend on pointer size. 7121 7122 // Determine whether \p K is a truncation based on an induction variable that 7123 // can be optimized. 7124 auto isOptimizableIVTruncate = 7125 [&](Instruction *K) -> std::function<bool(unsigned)> { 7126 return 7127 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; 7128 }; 7129 7130 if (LoopVectorizationPlanner::getDecisionAndClampRange( 7131 isOptimizableIVTruncate(I), Range)) 7132 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 7133 I); 7134 return nullptr; 7135 } 7136 7137 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 7138 // We know that all PHIs in non-header blocks are converted into selects, so 7139 // we don't have to worry about the insertion order and we can just use the 7140 // builder. At this point we generate the predication tree. There may be 7141 // duplications since this is a simple recursive scan, but future 7142 // optimizations will clean it up. 7143 7144 SmallVector<VPValue *, 2> Operands; 7145 unsigned NumIncoming = Phi->getNumIncomingValues(); 7146 for (unsigned In = 0; In < NumIncoming; In++) { 7147 VPValue *EdgeMask = 7148 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 7149 assert((EdgeMask || NumIncoming == 1) && 7150 "Multiple predecessors with one having a full mask"); 7151 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 7152 if (EdgeMask) 7153 Operands.push_back(EdgeMask); 7154 } 7155 return new VPBlendRecipe(Phi, Operands); 7156 } 7157 7158 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 7159 VPlan &Plan) const { 7160 7161 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7162 [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); }, 7163 Range); 7164 7165 if (IsPredicated) 7166 return nullptr; 7167 7168 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7169 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 7170 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 7171 return nullptr; 7172 7173 auto willWiden = [&](unsigned VF) -> bool { 7174 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7175 // The following case may be scalarized depending on the VF. 7176 // The flag shows whether we use Intrinsic or a usual Call for vectorized 7177 // version of the instruction. 7178 // Is it beneficial to perform intrinsic call compared to lib call? 7179 bool NeedToScalarize = false; 7180 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 7181 bool UseVectorIntrinsic = 7182 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 7183 return UseVectorIntrinsic || !NeedToScalarize; 7184 }; 7185 7186 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7187 return nullptr; 7188 7189 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 7190 } 7191 7192 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 7193 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 7194 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 7195 // Instruction should be widened, unless it is scalar after vectorization, 7196 // scalarization is profitable or it is predicated. 
7197 auto WillScalarize = [this, I](unsigned VF) -> bool { 7198 return CM.isScalarAfterVectorization(I, VF) || 7199 CM.isProfitableToScalarize(I, VF) || 7200 CM.isScalarWithPredication(I, VF); 7201 }; 7202 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 7203 Range); 7204 } 7205 7206 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 7207 auto IsVectorizableOpcode = [](unsigned Opcode) { 7208 switch (Opcode) { 7209 case Instruction::Add: 7210 case Instruction::And: 7211 case Instruction::AShr: 7212 case Instruction::BitCast: 7213 case Instruction::FAdd: 7214 case Instruction::FCmp: 7215 case Instruction::FDiv: 7216 case Instruction::FMul: 7217 case Instruction::FNeg: 7218 case Instruction::FPExt: 7219 case Instruction::FPToSI: 7220 case Instruction::FPToUI: 7221 case Instruction::FPTrunc: 7222 case Instruction::FRem: 7223 case Instruction::FSub: 7224 case Instruction::ICmp: 7225 case Instruction::IntToPtr: 7226 case Instruction::LShr: 7227 case Instruction::Mul: 7228 case Instruction::Or: 7229 case Instruction::PtrToInt: 7230 case Instruction::SDiv: 7231 case Instruction::Select: 7232 case Instruction::SExt: 7233 case Instruction::Shl: 7234 case Instruction::SIToFP: 7235 case Instruction::SRem: 7236 case Instruction::Sub: 7237 case Instruction::Trunc: 7238 case Instruction::UDiv: 7239 case Instruction::UIToFP: 7240 case Instruction::URem: 7241 case Instruction::Xor: 7242 case Instruction::ZExt: 7243 return true; 7244 } 7245 return false; 7246 }; 7247 7248 if (!IsVectorizableOpcode(I->getOpcode())) 7249 return nullptr; 7250 7251 // Success: widen this instruction. 7252 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 7253 } 7254 7255 VPBasicBlock *VPRecipeBuilder::handleReplication( 7256 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 7257 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 7258 VPlanPtr &Plan) { 7259 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 7260 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, 7261 Range); 7262 7263 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7264 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 7265 7266 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 7267 IsUniform, IsPredicated); 7268 setRecipe(I, Recipe); 7269 7270 // Find if I uses a predicated instruction. If so, it will use its scalar 7271 // value. Avoid hoisting the insert-element which packs the scalar value into 7272 // a vector value, as that happens iff all users use the vector value. 7273 for (auto &Op : I->operands()) 7274 if (auto *PredInst = dyn_cast<Instruction>(Op)) 7275 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 7276 PredInst2Recipe[PredInst]->setAlsoPack(false); 7277 7278 // Finalize the recipe for Instr, first if it is not predicated. 7279 if (!IsPredicated) { 7280 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 7281 VPBB->appendRecipe(Recipe); 7282 return VPBB; 7283 } 7284 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7285 assert(VPBB->getSuccessors().empty() && 7286 "VPBB has successors when handling predicated replication."); 7287 // Record predicated instructions for above packing optimizations. 
7288 PredInst2Recipe[I] = Recipe; 7289 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7290 VPBlockUtils::insertBlockAfter(Region, VPBB); 7291 auto *RegSucc = new VPBasicBlock(); 7292 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7293 return RegSucc; 7294 } 7295 7296 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7297 VPRecipeBase *PredRecipe, 7298 VPlanPtr &Plan) { 7299 // Instructions marked for predication are replicated and placed under an 7300 // if-then construct to prevent side-effects. 7301 7302 // Generate recipes to compute the block mask for this region. 7303 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 7304 7305 // Build the triangular if-then region. 7306 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 7307 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 7308 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 7309 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 7310 auto *PHIRecipe = 7311 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 7312 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 7313 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 7314 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 7315 7316 // Note: first set Entry as region entry and then connect successors starting 7317 // from it in order, to propagate the "parent" of each VPBasicBlock. 7318 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 7319 VPBlockUtils::connectBlocks(Pred, Exit); 7320 7321 return Region; 7322 } 7323 7324 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 7325 VFRange &Range, 7326 VPlanPtr &Plan) { 7327 // First, check for specific widening recipes that deal with calls, memory 7328 // operations, inductions and Phi nodes. 7329 if (auto *CI = dyn_cast<CallInst>(Instr)) 7330 return tryToWidenCall(CI, Range, *Plan); 7331 7332 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 7333 return tryToWidenMemory(Instr, Range, Plan); 7334 7335 VPRecipeBase *Recipe; 7336 if (auto Phi = dyn_cast<PHINode>(Instr)) { 7337 if (Phi->getParent() != OrigLoop->getHeader()) 7338 return tryToBlend(Phi, Plan); 7339 if ((Recipe = tryToOptimizeInductionPHI(Phi))) 7340 return Recipe; 7341 return new VPWidenPHIRecipe(Phi); 7342 } 7343 7344 if (isa<TruncInst>(Instr) && 7345 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range))) 7346 return Recipe; 7347 7348 if (!shouldWiden(Instr, Range)) 7349 return nullptr; 7350 7351 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 7352 return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()), 7353 OrigLoop); 7354 7355 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 7356 bool InvariantCond = 7357 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 7358 return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()), 7359 InvariantCond); 7360 } 7361 7362 return tryToWiden(Instr, *Plan); 7363 } 7364 7365 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 7366 unsigned MaxVF) { 7367 assert(OrigLoop->empty() && "Inner loop expected."); 7368 7369 // Collect conditions feeding internal conditional branches; they need to be 7370 // represented in VPlan for it to model masking. 
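// For example (hypothetical input), a loop body containing
//   if (a[i] > 42) { ... }
// has the compare feeding that conditional branch recorded here, so the edge
// and block masks derived from it can be expressed as VPlan values.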
7371 SmallPtrSet<Value *, 1> NeedDef;
7372
7373 auto *Latch = OrigLoop->getLoopLatch();
7374 for (BasicBlock *BB : OrigLoop->blocks()) {
7375 if (BB == Latch)
7376 continue;
7377 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7378 if (Branch && Branch->isConditional())
7379 NeedDef.insert(Branch->getCondition());
7380 }
7381
7382 // If the tail is to be folded by masking, the primary induction variable, if it
7383 // exists, needs to be represented in VPlan for it to model early-exit masking.
7384 // Also, both the Phi and the live-out instruction of each reduction are
7385 // required in order to introduce a select between them in VPlan.
7386 if (CM.foldTailByMasking()) {
7387 if (Legal->getPrimaryInduction())
7388 NeedDef.insert(Legal->getPrimaryInduction());
7389 for (auto &Reduction : Legal->getReductionVars()) {
7390 NeedDef.insert(Reduction.first);
7391 NeedDef.insert(Reduction.second.getLoopExitInstr());
7392 }
7393 }
7394
7395 // Collect instructions from the original loop that will become trivially dead
7396 // in the vectorized loop. We don't need to vectorize these instructions. For
7397 // example, original induction update instructions can become dead because we
7398 // separately emit induction "steps" when generating code for the new loop.
7399 // Similarly, we create a new latch condition when setting up the structure
7400 // of the new loop, so the old one can become dead.
7401 SmallPtrSet<Instruction *, 4> DeadInstructions;
7402 collectTriviallyDeadInstructions(DeadInstructions);
7403
7404 // Add assume instructions we need to drop to DeadInstructions, to prevent
7405 // them from being added to the VPlan.
7406 // TODO: We only need to drop assumes in blocks that get flattened. If the
7407 // control flow is preserved, we should keep them.
7408 auto &ConditionalAssumes = Legal->getConditionalAssumes();
7409 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7410
7411 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7412 // Dead instructions do not need sinking. Remove them from SinkAfter.
7413 for (Instruction *I : DeadInstructions)
7414 SinkAfter.erase(I);
7415
7416 for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7417 VFRange SubRange = {VF, MaxVF + 1};
7418 VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7419 DeadInstructions, SinkAfter));
7420 VF = SubRange.End;
7421 }
7422 }
7423
7424 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7425 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7426 SmallPtrSetImpl<Instruction *> &DeadInstructions,
7427 const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7428
7429 // Hold a mapping from predicated instructions to their recipes, in order to
7430 // fix their AlsoPack behavior if a user is determined to replicate and use a
7431 // scalar instead of a vector value.
7432 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7433
7434 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7435
7436 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7437
7438 // ---------------------------------------------------------------------------
7439 // Pre-construction: record ingredients whose recipes we'll need to further
7440 // process after constructing the initial VPlan.
7441 // ---------------------------------------------------------------------------
7442
7443 // Mark instructions we'll need to sink later and their targets as
7444 // ingredients whose recipe we'll need to record.
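// (General note, not an exhaustive list of sources: sink-after pairs typically
// originate from first-order recurrences, where an instruction has to be moved
// after the instruction producing the recurrence's previous value.)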
7445 for (auto &Entry : SinkAfter) {
7446 RecipeBuilder.recordRecipeOf(Entry.first);
7447 RecipeBuilder.recordRecipeOf(Entry.second);
7448 }
7449 for (auto &Reduction : CM.getInLoopReductionChains()) {
7450 PHINode *Phi = Reduction.first;
7451 RecurrenceDescriptor::RecurrenceKind Kind =
7452 Legal->getReductionVars()[Phi].getRecurrenceKind();
7453 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7454
7455 RecipeBuilder.recordRecipeOf(Phi);
7456 for (auto &R : ReductionOperations) {
7457 RecipeBuilder.recordRecipeOf(R);
7458 // For min/max reductions, where we have a pair of icmp/select, we also
7459 // need to record the ICmp recipe, so it can be removed later.
7460 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7461 Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7462 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
7463 }
7464 }
7465 }
7466
7467 // For each interleave group which is relevant for this (possibly trimmed)
7468 // Range, add it to the set of groups to be later applied to the VPlan and add
7469 // placeholders for its members' Recipes which we'll be replacing with a
7470 // single VPInterleaveRecipe.
7471 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7472 auto applyIG = [IG, this](unsigned VF) -> bool {
7473 return (VF >= 2 && // Query is illegal for VF == 1
7474 CM.getWideningDecision(IG->getInsertPos(), VF) ==
7475 LoopVectorizationCostModel::CM_Interleave);
7476 };
7477 if (!getDecisionAndClampRange(applyIG, Range))
7478 continue;
7479 InterleaveGroups.insert(IG);
7480 for (unsigned i = 0; i < IG->getFactor(); i++)
7481 if (Instruction *Member = IG->getMember(i))
7482 RecipeBuilder.recordRecipeOf(Member);
7483 };
7484
7485 // ---------------------------------------------------------------------------
7486 // Build initial VPlan: Scan the body of the loop in a topological order to
7487 // visit each basic block after having visited its predecessor basic blocks.
7488 // ---------------------------------------------------------------------------
7489
7490 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7491 auto Plan = std::make_unique<VPlan>();
7492 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7493 Plan->setEntry(VPBB);
7494
7495 // Represent values that will have defs inside VPlan.
7496 for (Value *V : NeedDef)
7497 Plan->addVPValue(V);
7498
7499 // Scan the body of the loop in a topological order to visit each basic block
7500 // after having visited its predecessor basic blocks.
7501 LoopBlocksDFS DFS(OrigLoop);
7502 DFS.perform(LI);
7503
7504 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7505 // Relevant instructions from basic block BB will be grouped into VPRecipe
7506 // ingredients and fill a new VPBasicBlock.
7507 unsigned VPBBsForBB = 0;
7508 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7509 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7510 VPBB = FirstVPBBForBB;
7511 Builder.setInsertPoint(VPBB);
7512
7513 // Introduce each ingredient into VPlan.
7514 // TODO: Model and preserve debug intrinsics in VPlan.
7515 for (Instruction &I : BB->instructionsWithoutDebug()) {
7516 Instruction *Instr = &I;
7517
7518 // First filter out irrelevant instructions, to ensure no recipes are
7519 // built for them.
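// (Typical examples are the original branches and the trivially-dead
// induction/latch-condition updates collected into DeadInstructions above.)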
7520 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 7521 continue; 7522 7523 if (auto Recipe = 7524 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { 7525 RecipeBuilder.setRecipe(Instr, Recipe); 7526 VPBB->appendRecipe(Recipe); 7527 continue; 7528 } 7529 7530 // Otherwise, if all widening options failed, Instruction is to be 7531 // replicated. This may create a successor for VPBB. 7532 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7533 Instr, Range, VPBB, PredInst2Recipe, Plan); 7534 if (NextVPBB != VPBB) { 7535 VPBB = NextVPBB; 7536 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 7537 : ""); 7538 } 7539 } 7540 } 7541 7542 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7543 // may also be empty, such as the last one VPBB, reflecting original 7544 // basic-blocks with no recipes. 7545 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7546 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7547 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7548 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7549 delete PreEntry; 7550 7551 // --------------------------------------------------------------------------- 7552 // Transform initial VPlan: Apply previously taken decisions, in order, to 7553 // bring the VPlan to its final state. 7554 // --------------------------------------------------------------------------- 7555 7556 // Apply Sink-After legal constraints. 7557 for (auto &Entry : SinkAfter) { 7558 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7559 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7560 Sink->moveAfter(Target); 7561 } 7562 7563 // Interleave memory: for each Interleave Group we marked earlier as relevant 7564 // for this VPlan, replace the Recipes widening its memory instructions with a 7565 // single VPInterleaveRecipe at its insertion point. 7566 for (auto IG : InterleaveGroups) { 7567 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7568 RecipeBuilder.getRecipe(IG->getInsertPos())); 7569 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) 7570 ->insertBefore(Recipe); 7571 7572 for (unsigned i = 0; i < IG->getFactor(); ++i) 7573 if (Instruction *Member = IG->getMember(i)) { 7574 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7575 } 7576 } 7577 7578 // Adjust the recipes for any inloop reductions. 7579 if (Range.Start > 1) 7580 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 7581 7582 // Finally, if tail is folded by masking, introduce selects between the phi 7583 // and the live-out instruction of each reduction, at the end of the latch. 
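// As an illustrative sketch (value names invented), each such select is
// roughly:
//   %rdx.sel = select <VF x i1> %mask, <VF x ty> %rdx.liveout, <VF x ty> %rdx.phi
// so lanes disabled by the fold-tail mask carry the previous value through.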
7584 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 7585 Builder.setInsertPoint(VPBB); 7586 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 7587 for (auto &Reduction : Legal->getReductionVars()) { 7588 assert(!CM.isInLoopReduction(Reduction.first) && 7589 "Didn't expect inloop tail folded reduction yet!"); 7590 VPValue *Phi = Plan->getVPValue(Reduction.first); 7591 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); 7592 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 7593 } 7594 } 7595 7596 std::string PlanName; 7597 raw_string_ostream RSO(PlanName); 7598 unsigned VF = Range.Start; 7599 Plan->addVF(VF); 7600 RSO << "Initial VPlan for VF={" << VF; 7601 for (VF *= 2; VF < Range.End; VF *= 2) { 7602 Plan->addVF(VF); 7603 RSO << "," << VF; 7604 } 7605 RSO << "},UF>=1"; 7606 RSO.flush(); 7607 Plan->setName(PlanName); 7608 7609 return Plan; 7610 } 7611 7612 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 7613 // Outer loop handling: They may require CFG and instruction level 7614 // transformations before even evaluating whether vectorization is profitable. 7615 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7616 // the vectorization pipeline. 7617 assert(!OrigLoop->empty()); 7618 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7619 7620 // Create new empty VPlan 7621 auto Plan = std::make_unique<VPlan>(); 7622 7623 // Build hierarchical CFG 7624 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 7625 HCFGBuilder.buildHierarchicalCFG(); 7626 7627 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) 7628 Plan->addVF(VF); 7629 7630 if (EnableVPlanPredication) { 7631 VPlanPredicator VPP(*Plan); 7632 VPP.predicate(); 7633 7634 // Avoid running transformation to recipes until masked code generation in 7635 // VPlan-native path is in place. 7636 return Plan; 7637 } 7638 7639 SmallPtrSet<Instruction *, 1> DeadInstructions; 7640 VPlanTransforms::VPInstructionsToVPRecipes( 7641 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 7642 return Plan; 7643 } 7644 7645 // Adjust the recipes for any inloop reductions. The chain of instructions 7646 // leading from the loop exit instr to the phi need to be converted to 7647 // reductions, with one operand being vector and the other being the scalar 7648 // reduction chain. 7649 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( 7650 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) { 7651 for (auto &Reduction : CM.getInLoopReductionChains()) { 7652 PHINode *Phi = Reduction.first; 7653 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 7654 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 7655 7656 // ReductionOperations are orders top-down from the phi's use to the 7657 // LoopExitValue. We keep a track of the previous item (the Chain) to tell 7658 // which of the two operands will remain scalar and which will be reduced. 7659 // For minmax the chain will be the select instructions. 
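// For example (hypothetical IR), for an in-loop add reduction
//   %sum.phi  = phi i32 [ 0, %preheader ], [ %sum.next, %latch ]
//   %sum.mid  = add i32 %sum.phi, %a
//   %sum.next = add i32 %sum.mid, %b
// ReductionOperations is {%sum.mid, %sum.next}; Chain starts at %sum.phi and
// advances through them, so at each step the chain operand stays scalar while
// the other operand becomes the vector operand of the VPReductionRecipe.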
7660 Instruction *Chain = Phi; 7661 for (Instruction *R : ReductionOperations) { 7662 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 7663 RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind(); 7664 7665 VPValue *ChainOp = Plan->getVPValue(Chain); 7666 unsigned FirstOpId; 7667 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7668 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7669 assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSelectSC && 7670 "Expected to replace a VPWidenSelectSC"); 7671 FirstOpId = 1; 7672 } else { 7673 assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC && 7674 "Expected to replace a VPWidenSC"); 7675 FirstOpId = 0; 7676 } 7677 unsigned VecOpId = 7678 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 7679 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 7680 7681 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 7682 &RdxDesc, R, ChainOp, VecOp, Legal->hasFunNoNaNAttr(), TTI); 7683 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 7684 WidenRecipe->eraseFromParent(); 7685 7686 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7687 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7688 VPRecipeBase *CompareRecipe = 7689 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 7690 assert(CompareRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC && 7691 "Expected to replace a VPWidenSC"); 7692 CompareRecipe->eraseFromParent(); 7693 } 7694 Chain = R; 7695 } 7696 } 7697 } 7698 7699 Value* LoopVectorizationPlanner::VPCallbackILV:: 7700 getOrCreateVectorValues(Value *V, unsigned Part) { 7701 return ILV.getOrCreateVectorValue(V, Part); 7702 } 7703 7704 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 7705 Value *V, const VPIteration &Instance) { 7706 return ILV.getOrCreateScalarValue(V, Instance); 7707 } 7708 7709 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 7710 VPSlotTracker &SlotTracker) const { 7711 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7712 IG->getInsertPos()->printAsOperand(O, false); 7713 O << ", "; 7714 getAddr()->printAsOperand(O, SlotTracker); 7715 VPValue *Mask = getMask(); 7716 if (Mask) { 7717 O << ", "; 7718 Mask->printAsOperand(O, SlotTracker); 7719 } 7720 for (unsigned i = 0; i < IG->getFactor(); ++i) 7721 if (Instruction *I = IG->getMember(i)) 7722 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 7723 } 7724 7725 void VPWidenCallRecipe::execute(VPTransformState &State) { 7726 State.ILV->widenCallInstruction(Ingredient, User, State); 7727 } 7728 7729 void VPWidenSelectRecipe::execute(VPTransformState &State) { 7730 State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State); 7731 } 7732 7733 void VPWidenRecipe::execute(VPTransformState &State) { 7734 State.ILV->widenInstruction(Ingredient, User, State); 7735 } 7736 7737 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7738 State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant, 7739 IsIndexLoopInvariant, State); 7740 } 7741 7742 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7743 assert(!State.Instance && "Int or FP induction being replicated."); 7744 State.ILV->widenIntOrFpInduction(IV, Trunc); 7745 } 7746 7747 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7748 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7749 } 7750 7751 void VPBlendRecipe::execute(VPTransformState &State) { 7752 
State.ILV->setDebugLocFromInst(State.Builder, Phi); 7753 // We know that all PHIs in non-header blocks are converted into 7754 // selects, so we don't have to worry about the insertion order and we 7755 // can just use the builder. 7756 // At this point we generate the predication tree. There may be 7757 // duplications since this is a simple recursive scan, but future 7758 // optimizations will clean it up. 7759 7760 unsigned NumIncoming = getNumIncomingValues(); 7761 7762 // Generate a sequence of selects of the form: 7763 // SELECT(Mask3, In3, 7764 // SELECT(Mask2, In2, 7765 // SELECT(Mask1, In1, 7766 // In0))) 7767 // Note that Mask0 is never used: lanes for which no path reaches this phi and 7768 // are essentially undef are taken from In0. 7769 InnerLoopVectorizer::VectorParts Entry(State.UF); 7770 for (unsigned In = 0; In < NumIncoming; ++In) { 7771 for (unsigned Part = 0; Part < State.UF; ++Part) { 7772 // We might have single edge PHIs (blocks) - use an identity 7773 // 'select' for the first PHI operand. 7774 Value *In0 = State.get(getIncomingValue(In), Part); 7775 if (In == 0) 7776 Entry[Part] = In0; // Initialize with the first incoming value. 7777 else { 7778 // Select between the current value and the previous incoming edge 7779 // based on the incoming mask. 7780 Value *Cond = State.get(getMask(In), Part); 7781 Entry[Part] = 7782 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 7783 } 7784 } 7785 } 7786 for (unsigned Part = 0; Part < State.UF; ++Part) 7787 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 7788 } 7789 7790 void VPInterleaveRecipe::execute(VPTransformState &State) { 7791 assert(!State.Instance && "Interleave group being replicated."); 7792 State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask()); 7793 } 7794 7795 void VPReductionRecipe::execute(VPTransformState &State) { 7796 assert(!State.Instance && "Reduction being replicated."); 7797 for (unsigned Part = 0; Part < State.UF; ++Part) { 7798 unsigned Kind = RdxDesc->getRecurrenceKind(); 7799 Value *NewVecOp = State.get(VecOp, Part); 7800 Value *NewRed = 7801 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN); 7802 Value *PrevInChain = State.get(ChainOp, Part); 7803 Value *NextInChain; 7804 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7805 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7806 NextInChain = 7807 createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(), 7808 NewRed, PrevInChain); 7809 } else { 7810 NextInChain = State.Builder.CreateBinOp( 7811 (Instruction::BinaryOps)I->getOpcode(), NewRed, PrevInChain); 7812 } 7813 State.ValueMap.setVectorValue(I, Part, NextInChain); 7814 } 7815 } 7816 7817 void VPReplicateRecipe::execute(VPTransformState &State) { 7818 if (State.Instance) { // Generate a single instance. 7819 State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance, 7820 IsPredicated, State); 7821 // Insert scalar instance packing it into a vector. 7822 if (AlsoPack && State.VF > 1) { 7823 // If we're constructing lane 0, initialize to start from undef. 
7824 if (State.Instance->Lane == 0) {
7825 Value *Undef = UndefValue::get(
7826 FixedVectorType::get(Ingredient->getType(), State.VF));
7827 State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7828 }
7829 State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7830 }
7831 return;
7832 }
7833
7834 // Generate scalar instances for all VF lanes of all UF parts, unless the
7835 // instruction is uniform, in which case generate only the first lane for each
7836 // of the UF parts.
7837 unsigned EndLane = IsUniform ? 1 : State.VF;
7838 for (unsigned Part = 0; Part < State.UF; ++Part)
7839 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7840 State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane},
7841 IsPredicated, State);
7842 }
7843
7844 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7845 assert(State.Instance && "Branch on Mask works only on single instance.");
7846
7847 unsigned Part = State.Instance->Part;
7848 unsigned Lane = State.Instance->Lane;
7849
7850 Value *ConditionBit = nullptr;
7851 VPValue *BlockInMask = getMask();
7852 if (BlockInMask) {
7853 ConditionBit = State.get(BlockInMask, Part);
7854 if (ConditionBit->getType()->isVectorTy())
7855 ConditionBit = State.Builder.CreateExtractElement(
7856 ConditionBit, State.Builder.getInt32(Lane));
7857 } else // Block in mask is all-one.
7858 ConditionBit = State.Builder.getTrue();
7859
7860 // Replace the temporary unreachable terminator with a new conditional branch,
7861 // whose two destinations will be set later when they are created.
7862 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7863 assert(isa<UnreachableInst>(CurrentTerminator) &&
7864 "Expected to replace unreachable terminator with conditional branch.");
7865 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7866 CondBr->setSuccessor(0, nullptr);
7867 ReplaceInstWithInst(CurrentTerminator, CondBr);
7868 }
7869
7870 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7871 assert(State.Instance && "Predicated instruction PHI works per instance.");
7872 Instruction *ScalarPredInst = cast<Instruction>(
7873 State.ValueMap.getScalarValue(PredInst, *State.Instance));
7874 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7875 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7876 assert(PredicatingBB && "Predicated block has no single predecessor.");
7877
7878 // By current pack/unpack logic we need to generate only a single phi node: if
7879 // a vector value for the predicated instruction exists at this point it means
7880 // the instruction has vector users only, and a phi for the vector value is
7881 // needed. In this case the recipe of the predicated instruction is marked to
7882 // also do that packing, thereby "hoisting" the insert-element sequence.
7883 // Otherwise, a phi node for the scalar value is needed.
7884 unsigned Part = State.Instance->Part;
7885 if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7886 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7887 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7888 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7889 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7890 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7891 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7892 } else { 7893 Type *PredInstType = PredInst->getType(); 7894 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 7895 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 7896 Phi->addIncoming(ScalarPredInst, PredicatedBB); 7897 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 7898 } 7899 } 7900 7901 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 7902 VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr; 7903 State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue, 7904 getMask()); 7905 } 7906 7907 // Determine how to lower the scalar epilogue, which depends on 1) optimising 7908 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 7909 // predication, and 4) a TTI hook that analyses whether the loop is suitable 7910 // for predication. 7911 static ScalarEpilogueLowering getScalarEpilogueLowering( 7912 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 7913 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 7914 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 7915 LoopVectorizationLegality &LVL) { 7916 bool OptSize = 7917 F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 7918 PGSOQueryType::IRPass); 7919 // 1) OptSize takes precedence over all other options, i.e. if this is set, 7920 // don't look at hints or options, and don't request a scalar epilogue. 7921 if (OptSize) 7922 return CM_ScalarEpilogueNotAllowedOptSize; 7923 7924 bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && 7925 !PreferPredicateOverEpilog; 7926 7927 // 2) Next, if disabling predication is requested on the command line, honour 7928 // this and request a scalar epilogue. 7929 if (PredicateOptDisabled) 7930 return CM_ScalarEpilogueAllowed; 7931 7932 // 3) and 4) look if enabling predication is requested on the command line, 7933 // with a loop hint, or if the TTI hook indicates this is profitable, request 7934 // predication . 7935 if (PreferPredicateOverEpilog || 7936 Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || 7937 (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 7938 LVL.getLAI()) && 7939 Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) 7940 return CM_ScalarEpilogueNotNeededUsePredicate; 7941 7942 return CM_ScalarEpilogueAllowed; 7943 } 7944 7945 // Process the loop in the VPlan-native vectorization path. This path builds 7946 // VPlan upfront in the vectorization pipeline, which allows to apply 7947 // VPlan-to-VPlan transformations from the very beginning without modifying the 7948 // input LLVM IR. 
7949 static bool processLoopInVPlanNativePath( 7950 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 7951 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 7952 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 7953 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 7954 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 7955 7956 if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) { 7957 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 7958 return false; 7959 } 7960 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 7961 Function *F = L->getHeader()->getParent(); 7962 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 7963 7964 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7965 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 7966 7967 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 7968 &Hints, IAI); 7969 // Use the planner for outer loop vectorization. 7970 // TODO: CM is not used at this point inside the planner. Turn CM into an 7971 // optional argument if we don't need it in the future. 7972 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 7973 7974 // Get user vectorization factor. 7975 const unsigned UserVF = Hints.getWidth(); 7976 7977 // Plan how to best vectorize, return the best VF and its cost. 7978 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 7979 7980 // If we are stress testing VPlan builds, do not attempt to generate vector 7981 // code. Masked vector code generation support will follow soon. 7982 // Also, do not attempt to vectorize if no vector code will be produced. 7983 if (VPlanBuildStressTest || EnableVPlanPredication || 7984 VectorizationFactor::Disabled() == VF) 7985 return false; 7986 7987 LVP.setBestPlan(VF.Width, 1); 7988 7989 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 7990 &CM, BFI, PSI); 7991 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 7992 << L->getHeader()->getParent()->getName() << "\"\n"); 7993 LVP.executePlan(LB, DT); 7994 7995 // Mark the loop as already vectorized to avoid vectorizing again. 7996 Hints.setAlreadyVectorized(); 7997 7998 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 7999 return true; 8000 } 8001 8002 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 8003 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 8004 !EnableLoopInterleaving), 8005 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 8006 !EnableLoopVectorization) {} 8007 8008 bool LoopVectorizePass::processLoop(Loop *L) { 8009 assert((EnableVPlanNativePath || L->empty()) && 8010 "VPlan-native path is not enabled. Only process inner loops."); 8011 8012 #ifndef NDEBUG 8013 const std::string DebugLocStr = getDebugLocString(L); 8014 #endif /* NDEBUG */ 8015 8016 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 8017 << L->getHeader()->getParent()->getName() << "\" from " 8018 << DebugLocStr << "\n"); 8019 8020 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 8021 8022 LLVM_DEBUG( 8023 dbgs() << "LV: Loop hints:" 8024 << " force=" 8025 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 8026 ? "disabled" 8027 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 8028 ? 
"enabled" 8029 : "?")) 8030 << " width=" << Hints.getWidth() 8031 << " unroll=" << Hints.getInterleave() << "\n"); 8032 8033 // Function containing loop 8034 Function *F = L->getHeader()->getParent(); 8035 8036 // Looking at the diagnostic output is the only way to determine if a loop 8037 // was vectorized (other than looking at the IR or machine code), so it 8038 // is important to generate an optimization remark for each loop. Most of 8039 // these messages are generated as OptimizationRemarkAnalysis. Remarks 8040 // generated as OptimizationRemark and OptimizationRemarkMissed are 8041 // less verbose reporting vectorized loops and unvectorized loops that may 8042 // benefit from vectorization, respectively. 8043 8044 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 8045 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 8046 return false; 8047 } 8048 8049 PredicatedScalarEvolution PSE(*SE, *L); 8050 8051 // Check if it is legal to vectorize the loop. 8052 LoopVectorizationRequirements Requirements(*ORE); 8053 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 8054 &Requirements, &Hints, DB, AC, BFI, PSI); 8055 if (!LVL.canVectorize(EnableVPlanNativePath)) { 8056 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 8057 Hints.emitRemarkWithHints(); 8058 return false; 8059 } 8060 8061 // Check the function attributes and profiles to find out if this function 8062 // should be optimized for size. 8063 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 8064 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 8065 8066 // Entrance to the VPlan-native vectorization path. Outer loops are processed 8067 // here. They may require CFG and instruction level transformations before 8068 // even evaluating whether vectorization is profitable. Since we cannot modify 8069 // the incoming IR, we need to build VPlan upfront in the vectorization 8070 // pipeline. 8071 if (!L->empty()) 8072 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 8073 ORE, BFI, PSI, Hints); 8074 8075 assert(L->empty() && "Inner loop expected."); 8076 8077 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 8078 // count by optimizing for size, to minimize overheads. 8079 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 8080 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 8081 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 8082 << "This loop is worth vectorizing only if no scalar " 8083 << "iteration overheads are incurred."); 8084 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 8085 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 8086 else { 8087 LLVM_DEBUG(dbgs() << "\n"); 8088 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 8089 } 8090 } 8091 8092 // Check the function attributes to see if implicit floats are allowed. 8093 // FIXME: This check doesn't seem possibly correct -- what if the loop is 8094 // an integer loop and the vector instructions selected are purely integer 8095 // vector instructions? 8096 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 8097 reportVectorizationFailure( 8098 "Can't vectorize when the NoImplicitFloat attribute is used", 8099 "loop not vectorized due to NoImplicitFloat attribute", 8100 "NoImplicitFloat", ORE, L); 8101 Hints.emitRemarkWithHints(); 8102 return false; 8103 } 8104 8105 // Check if the target supports potentially unsafe FP vectorization. 
8106 // FIXME: Add a check for the type of safety issue (denormal, signaling) 8107 // for the target we're vectorizing for, to make sure none of the 8108 // additional fp-math flags can help. 8109 if (Hints.isPotentiallyUnsafe() && 8110 TTI->isFPVectorizationPotentiallyUnsafe()) { 8111 reportVectorizationFailure( 8112 "Potentially unsafe FP op prevents vectorization", 8113 "loop not vectorized due to unsafe FP support.", 8114 "UnsafeFP", ORE, L); 8115 Hints.emitRemarkWithHints(); 8116 return false; 8117 } 8118 8119 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 8120 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 8121 8122 // If an override option has been passed in for interleaved accesses, use it. 8123 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 8124 UseInterleaved = EnableInterleavedMemAccesses; 8125 8126 // Analyze interleaved memory accesses. 8127 if (UseInterleaved) { 8128 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 8129 } 8130 8131 // Use the cost model. 8132 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 8133 F, &Hints, IAI); 8134 CM.collectValuesToIgnore(); 8135 8136 // Use the planner for vectorization. 8137 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); 8138 8139 // Get user vectorization factor and interleave count. 8140 unsigned UserVF = Hints.getWidth(); 8141 unsigned UserIC = Hints.getInterleave(); 8142 8143 // Plan how to best vectorize, return the best VF and its cost. 8144 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 8145 8146 VectorizationFactor VF = VectorizationFactor::Disabled(); 8147 unsigned IC = 1; 8148 8149 if (MaybeVF) { 8150 VF = *MaybeVF; 8151 // Select the interleave count. 8152 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 8153 } 8154 8155 // Identify the diagnostic messages that should be produced. 8156 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 8157 bool VectorizeLoop = true, InterleaveLoop = true; 8158 if (Requirements.doesNotMeet(F, L, Hints)) { 8159 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " 8160 "requirements.\n"); 8161 Hints.emitRemarkWithHints(); 8162 return false; 8163 } 8164 8165 if (VF.Width == 1) { 8166 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 8167 VecDiagMsg = std::make_pair( 8168 "VectorizationNotBeneficial", 8169 "the cost-model indicates that vectorization is not beneficial"); 8170 VectorizeLoop = false; 8171 } 8172 8173 if (!MaybeVF && UserIC > 1) { 8174 // Tell the user interleaving was avoided up-front, despite being explicitly 8175 // requested. 8176 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 8177 "interleaving should be avoided up front\n"); 8178 IntDiagMsg = std::make_pair( 8179 "InterleavingAvoided", 8180 "Ignoring UserIC, because interleaving was avoided up front"); 8181 InterleaveLoop = false; 8182 } else if (IC == 1 && UserIC <= 1) { 8183 // Tell the user interleaving is not beneficial. 
8184 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 8185 IntDiagMsg = std::make_pair( 8186 "InterleavingNotBeneficial", 8187 "the cost-model indicates that interleaving is not beneficial"); 8188 InterleaveLoop = false; 8189 if (UserIC == 1) { 8190 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; 8191 IntDiagMsg.second += 8192 " and is explicitly disabled or interleave count is set to 1"; 8193 } 8194 } else if (IC > 1 && UserIC == 1) { 8195 // Tell the user interleaving is beneficial, but it explicitly disabled. 8196 LLVM_DEBUG( 8197 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); 8198 IntDiagMsg = std::make_pair( 8199 "InterleavingBeneficialButDisabled", 8200 "the cost-model indicates that interleaving is beneficial " 8201 "but is explicitly disabled or interleave count is set to 1"); 8202 InterleaveLoop = false; 8203 } 8204 8205 // Override IC if user provided an interleave count. 8206 IC = UserIC > 0 ? UserIC : IC; 8207 8208 // Emit diagnostic messages, if any. 8209 const char *VAPassName = Hints.vectorizeAnalysisPassName(); 8210 if (!VectorizeLoop && !InterleaveLoop) { 8211 // Do not vectorize or interleaving the loop. 8212 ORE->emit([&]() { 8213 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 8214 L->getStartLoc(), L->getHeader()) 8215 << VecDiagMsg.second; 8216 }); 8217 ORE->emit([&]() { 8218 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 8219 L->getStartLoc(), L->getHeader()) 8220 << IntDiagMsg.second; 8221 }); 8222 return false; 8223 } else if (!VectorizeLoop && InterleaveLoop) { 8224 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 8225 ORE->emit([&]() { 8226 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 8227 L->getStartLoc(), L->getHeader()) 8228 << VecDiagMsg.second; 8229 }); 8230 } else if (VectorizeLoop && !InterleaveLoop) { 8231 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 8232 << ") in " << DebugLocStr << '\n'); 8233 ORE->emit([&]() { 8234 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 8235 L->getStartLoc(), L->getHeader()) 8236 << IntDiagMsg.second; 8237 }); 8238 } else if (VectorizeLoop && InterleaveLoop) { 8239 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 8240 << ") in " << DebugLocStr << '\n'); 8241 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 8242 } 8243 8244 LVP.setBestPlan(VF.Width, IC); 8245 8246 using namespace ore; 8247 bool DisableRuntimeUnroll = false; 8248 MDNode *OrigLoopID = L->getLoopID(); 8249 8250 if (!VectorizeLoop) { 8251 assert(IC > 1 && "interleave count should not be 1 or 0"); 8252 // If we decided that it is not legal to vectorize the loop, then 8253 // interleave it. 8254 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, 8255 BFI, PSI); 8256 LVP.executePlan(Unroller, DT); 8257 8258 ORE->emit([&]() { 8259 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 8260 L->getHeader()) 8261 << "interleaved loop (interleaved count: " 8262 << NV("InterleaveCount", IC) << ")"; 8263 }); 8264 } else { 8265 // If we decided that it is *legal* to vectorize the loop, then do it. 8266 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 8267 &LVL, &CM, BFI, PSI); 8268 LVP.executePlan(LB, DT); 8269 ++LoopsVectorized; 8270 8271 // Add metadata to disable runtime unrolling a scalar loop when there are 8272 // no runtime checks about strides and memory. A scalar loop that is 8273 // rarely used is not worth unrolling. 
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt vectorization if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

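  // Report whether any IR changed at all (including the loop-simplify and
  // LCSSA work above) and whether the CFG changed, so the caller can decide
  // which analyses remain valid.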
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve the LoopInfo/DominatorTree analyses with
  // outer-loop vectorization. Until this is addressed, mark these analyses
  // as preserved only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for the VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}
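
// A minimal usage sketch (not part of the pass itself): the vectorizer can be
// run in isolation under the new pass manager with
//
//   opt -passes=loop-vectorize -S input.ll
//
// or programmatically, assuming a default-constructed LoopVectorizePass and a
// FunctionAnalysisManager FAM with the required analyses already registered:
//
//   FunctionPassManager FPM;
//   FPM.addPass(LoopVectorizePass());
//   FPM.run(F, FAM);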