//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one (an illustrative sketch of this
// transformation appears at the end of this header).
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
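//
// As an illustrative sketch of the widening transformation described above
// (the loop and names below are hypothetical, not taken from this file),
// assume a vectorization factor (VF) of 4 and ignore the epilogue that
// handles trip counts which are not a multiple of VF. A scalar loop such as
//
//   for (int i = 0; i < n; i += 1)
//     c[i] = a[i] + b[i];
//
// is rewritten so that each new iteration performs the work of four
// consecutive original iterations, with the four additions emitted as a
// single add on a <4 x T> vector of the element type:
//
//   for (int i = 0; i + 4 <= n; i += 4) {
//     c[i + 0] = a[i + 0] + b[i + 0];
//     c[i + 1] = a[i + 1] + b[i + 1];
//     c[i + 2] = a[i + 2] + b[i + 2];
//     c[i + 3] = a[i + 3] + b[i + 3];
//   }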
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired and predication should be used
// instead. This means that the vectorizer will try to fold the loop-tail
// (epilogue) into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting the vectorization factor, "
             "which will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses "
             "in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function for converting Scalar types to vector types.
/// If the incoming type is void, we return void. If the VF is 1, we return
/// the scalar type.
static Type *ToVectorTy(Type *Scalar, unsigned VF) {
  if (Scalar->isVoidTy() || VF == 1)
    return Scalar;
  return VectorType::get(Scalar, VF);
}

/// A helper function that returns the type of a loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
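  // Illustrative note (assuming a typical data layout where i1 has an alloc
  // size of one byte): with VF = 4, an array of 4 i1 values occupies 4 bytes,
  // while a <4 x i1> vector has a store size of a single byte, so i1 would be
  // reported as irregular here. The exact sizes always come from DL.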
  if (VF > 1) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
                bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive.
  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
                            bool IfPredicateInstr);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate
  /// a vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use.
  /// Otherwise, if the use is scalar, we can use the existing scalar
  /// definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize the interleaved access group that \p Instr belongs to
  /// with the base address given in \p Addr, optionally masking the vector
  /// operations if \p BlockInMask is non-null. Use \p State to translate given
  /// VPValues to IR values in the vectorized loop.
  void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
                                VPValue *Addr, VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr,
                                  VPValue *BlockInMask = nullptr);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// This is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at \p StartIdx.
  /// \p Opcode is relevant for FP induction variables.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in
  /// the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
  /// otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In
  /// the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AliasAnalysis *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar loops.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B,
                                              const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF();

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// Memory access instructions may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    MaybeAlign Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the loop tail.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either the vector version isn't available or it is too
  /// expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

  /// The cost computation for interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);

  /// The cost computation for Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, unsigned VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
1419 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1420 1421 /// Returns the expected difference in cost from scalarizing the expression 1422 /// feeding a predicated instruction \p PredInst. The instructions to 1423 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1424 /// non-negative return value implies the expression will be scalarized. 1425 /// Currently, only single-use chains are considered for scalarization. 1426 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1427 unsigned VF); 1428 1429 /// Collect the instructions that are uniform after vectorization. An 1430 /// instruction is uniform if we represent it with a single scalar value in 1431 /// the vectorized loop corresponding to each vector iteration. Examples of 1432 /// uniform instructions include pointer operands of consecutive or 1433 /// interleaved memory accesses. Note that although uniformity implies an 1434 /// instruction will be scalar, the reverse is not true. In general, a 1435 /// scalarized instruction will be represented by VF scalar values in the 1436 /// vectorized loop, each corresponding to an iteration of the original 1437 /// scalar loop. 1438 void collectLoopUniforms(unsigned VF); 1439 1440 /// Collect the instructions that are scalar after vectorization. An 1441 /// instruction is scalar if it is known to be uniform or will be scalarized 1442 /// during vectorization. Non-uniform scalarized instructions will be 1443 /// represented by VF values in the vectorized loop, each corresponding to an 1444 /// iteration of the original scalar loop. 1445 void collectLoopScalars(unsigned VF); 1446 1447 /// Keeps cost model vectorization decision and cost for instructions. 1448 /// Right now it is used for memory instructions only. 1449 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, 1450 std::pair<InstWidening, unsigned>>; 1451 1452 DecisionList WideningDecisions; 1453 1454 /// Returns true if \p V is expected to be vectorized and it needs to be 1455 /// extracted. 1456 bool needsExtract(Value *V, unsigned VF) const { 1457 Instruction *I = dyn_cast<Instruction>(V); 1458 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) 1459 return false; 1460 1461 // Assume we can vectorize V (and hence we need extraction) if the 1462 // scalars are not computed yet. This can happen, because it is called 1463 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1464 // the scalars are collected. That should be a safe assumption in most 1465 // cases, because we check if the operands have vectorizable types 1466 // beforehand in LoopVectorizationLegality. 1467 return Scalars.find(VF) == Scalars.end() || 1468 !isScalarAfterVectorization(I, VF); 1469 }; 1470 1471 /// Returns a range containing only operands needing to be extracted. 1472 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1473 unsigned VF) { 1474 return SmallVector<Value *, 4>(make_filter_range( 1475 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1476 } 1477 1478 public: 1479 /// The loop that we evaluate. 1480 Loop *TheLoop; 1481 1482 /// Predicated scalar evolution analysis. 1483 PredicatedScalarEvolution &PSE; 1484 1485 /// Loop Info analysis. 1486 LoopInfo *LI; 1487 1488 /// Vectorization legality. 1489 LoopVectorizationLegality *Legal; 1490 1491 /// Vector target information. 1492 const TargetTransformInfo &TTI; 1493 1494 /// Target Library Info. 
1495 const TargetLibraryInfo *TLI; 1496 1497 /// Demanded bits analysis. 1498 DemandedBits *DB; 1499 1500 /// Assumption cache. 1501 AssumptionCache *AC; 1502 1503 /// Interface to emit optimization remarks. 1504 OptimizationRemarkEmitter *ORE; 1505 1506 const Function *TheFunction; 1507 1508 /// Loop Vectorize Hint. 1509 const LoopVectorizeHints *Hints; 1510 1511 /// The interleave access information contains groups of interleaved accesses 1512 /// with the same stride and close to each other. 1513 InterleavedAccessInfo &InterleaveInfo; 1514 1515 /// Values to ignore in the cost model. 1516 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1517 1518 /// Values to ignore in the cost model when VF > 1. 1519 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1520 }; 1521 1522 } // end namespace llvm 1523 1524 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1525 // vectorization. The loop needs to be annotated with #pragma omp simd 1526 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1527 // vector length information is not provided, vectorization is not considered 1528 // explicit. Interleave hints are not allowed either. These limitations will be 1529 // relaxed in the future. 1530 // Please, note that we are currently forced to abuse the pragma 'clang 1531 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1532 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1533 // provides *explicit vectorization hints* (LV can bypass legal checks and 1534 // assume that vectorization is legal). However, both hints are implemented 1535 // using the same metadata (llvm.loop.vectorize, processed by 1536 // LoopVectorizeHints). This will be fixed in the future when the native IR 1537 // representation for pragma 'omp simd' is introduced. 1538 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1539 OptimizationRemarkEmitter *ORE) { 1540 assert(!OuterLp->empty() && "This is not an outer loop"); 1541 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1542 1543 // Only outer loops with an explicit vectorization hint are supported. 1544 // Unannotated outer loops are ignored. 1545 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1546 return false; 1547 1548 Function *Fn = OuterLp->getHeader()->getParent(); 1549 if (!Hints.allowVectorization(Fn, OuterLp, 1550 true /*VectorizeOnlyWhenForced*/)) { 1551 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1552 return false; 1553 } 1554 1555 if (Hints.getInterleave() > 1) { 1556 // TODO: Interleave support is future work. 1557 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1558 "outer loops.\n"); 1559 Hints.emitRemarkWithHints(); 1560 return false; 1561 } 1562 1563 return true; 1564 } 1565 1566 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1567 OptimizationRemarkEmitter *ORE, 1568 SmallVectorImpl<Loop *> &V) { 1569 // Collect inner loops and outer loops without irreducible control flow. For 1570 // now, only collect outer loops that have explicit vectorization hints. If we 1571 // are stress testing the VPlan H-CFG construction, we collect the outermost 1572 // loop of every loop nest. 
1573 if (L.empty() || VPlanBuildStressTest || 1574 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1575 LoopBlocksRPO RPOT(&L); 1576 RPOT.perform(LI); 1577 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1578 V.push_back(&L); 1579 // TODO: Collect inner loops inside marked outer loops in case 1580 // vectorization fails for the outer loop. Do not invoke 1581 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1582 // already known to be reducible. We can use an inherited attribute for 1583 // that. 1584 return; 1585 } 1586 } 1587 for (Loop *InnerL : L) 1588 collectSupportedLoops(*InnerL, LI, ORE, V); 1589 } 1590 1591 namespace { 1592 1593 /// The LoopVectorize Pass. 1594 struct LoopVectorize : public FunctionPass { 1595 /// Pass identification, replacement for typeid 1596 static char ID; 1597 1598 LoopVectorizePass Impl; 1599 1600 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1601 bool VectorizeOnlyWhenForced = false) 1602 : FunctionPass(ID) { 1603 Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced; 1604 Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced; 1605 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1606 } 1607 1608 bool runOnFunction(Function &F) override { 1609 if (skipFunction(F)) 1610 return false; 1611 1612 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1613 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1614 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1615 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1616 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1617 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1618 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1619 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1620 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1621 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1622 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1623 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1624 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1625 1626 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1627 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1628 1629 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1630 GetLAA, *ORE, PSI); 1631 } 1632 1633 void getAnalysisUsage(AnalysisUsage &AU) const override { 1634 AU.addRequired<AssumptionCacheTracker>(); 1635 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1636 AU.addRequired<DominatorTreeWrapperPass>(); 1637 AU.addRequired<LoopInfoWrapperPass>(); 1638 AU.addRequired<ScalarEvolutionWrapperPass>(); 1639 AU.addRequired<TargetTransformInfoWrapperPass>(); 1640 AU.addRequired<AAResultsWrapperPass>(); 1641 AU.addRequired<LoopAccessLegacyAnalysis>(); 1642 AU.addRequired<DemandedBitsWrapperPass>(); 1643 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1644 AU.addRequired<InjectTLIMappingsLegacy>(); 1645 1646 // We currently do not preserve loopinfo/dominator analyses with outer loop 1647 // vectorization. Until this is addressed, mark these analyses as preserved 1648 // only for non-VPlan-native path. 1649 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
1650 if (!EnableVPlanNativePath) { 1651 AU.addPreserved<LoopInfoWrapperPass>(); 1652 AU.addPreserved<DominatorTreeWrapperPass>(); 1653 } 1654 1655 AU.addPreserved<BasicAAWrapperPass>(); 1656 AU.addPreserved<GlobalsAAWrapperPass>(); 1657 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1658 } 1659 }; 1660 1661 } // end anonymous namespace 1662 1663 //===----------------------------------------------------------------------===// 1664 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1665 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1666 //===----------------------------------------------------------------------===// 1667 1668 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1669 // We need to place the broadcast of invariant variables outside the loop, 1670 // but only if it's proven safe to do so. Else, broadcast will be inside 1671 // vector loop body. 1672 Instruction *Instr = dyn_cast<Instruction>(V); 1673 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1674 (!Instr || 1675 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1676 // Place the code for broadcasting invariant variables in the new preheader. 1677 IRBuilder<>::InsertPointGuard Guard(Builder); 1678 if (SafeToHoist) 1679 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1680 1681 // Broadcast the scalar into all locations in the vector. 1682 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1683 1684 return Shuf; 1685 } 1686 1687 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1688 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1689 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1690 "Expected either an induction phi-node or a truncate of it!"); 1691 Value *Start = II.getStartValue(); 1692 1693 // Construct the initial value of the vector IV in the vector loop preheader 1694 auto CurrIP = Builder.saveIP(); 1695 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1696 if (isa<TruncInst>(EntryVal)) { 1697 assert(Start->getType()->isIntegerTy() && 1698 "Truncation requires an integer type"); 1699 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1700 Step = Builder.CreateTrunc(Step, TruncType); 1701 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1702 } 1703 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1704 Value *SteppedStart = 1705 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1706 1707 // We create vector phi nodes for both integer and floating-point induction 1708 // variables. Here, we determine the kind of arithmetic we will perform. 1709 Instruction::BinaryOps AddOp; 1710 Instruction::BinaryOps MulOp; 1711 if (Step->getType()->isIntegerTy()) { 1712 AddOp = Instruction::Add; 1713 MulOp = Instruction::Mul; 1714 } else { 1715 AddOp = II.getInductionOpcode(); 1716 MulOp = Instruction::FMul; 1717 } 1718 1719 // Multiply the vectorization factor by the step using integer or 1720 // floating-point arithmetic as appropriate. 1721 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); 1722 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1723 1724 // Create a vector splat to use in the induction update. 1725 // 1726 // FIXME: If the step is non-constant, we create the vector splat with 1727 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1728 // handle a constant vector splat. 1729 Value *SplatVF = isa<Constant>(Mul) 1730 ? 
ConstantVector::getSplat(VF, cast<Constant>(Mul)) 1731 : Builder.CreateVectorSplat(VF, Mul); 1732 Builder.restoreIP(CurrIP); 1733 1734 // We may need to add the step a number of times, depending on the unroll 1735 // factor. The last of those goes into the PHI. 1736 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1737 &*LoopVectorBody->getFirstInsertionPt()); 1738 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1739 Instruction *LastInduction = VecInd; 1740 for (unsigned Part = 0; Part < UF; ++Part) { 1741 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1742 1743 if (isa<TruncInst>(EntryVal)) 1744 addMetadata(LastInduction, EntryVal); 1745 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1746 1747 LastInduction = cast<Instruction>(addFastMathFlag( 1748 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1749 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1750 } 1751 1752 // Move the last step to the end of the latch block. This ensures consistent 1753 // placement of all induction updates. 1754 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1755 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1756 auto *ICmp = cast<Instruction>(Br->getCondition()); 1757 LastInduction->moveBefore(ICmp); 1758 LastInduction->setName("vec.ind.next"); 1759 1760 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1761 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1762 } 1763 1764 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1765 return Cost->isScalarAfterVectorization(I, VF) || 1766 Cost->isProfitableToScalarize(I, VF); 1767 } 1768 1769 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1770 if (shouldScalarizeInstruction(IV)) 1771 return true; 1772 auto isScalarInst = [&](User *U) -> bool { 1773 auto *I = cast<Instruction>(U); 1774 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1775 }; 1776 return llvm::any_of(IV->users(), isScalarInst); 1777 } 1778 1779 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1780 const InductionDescriptor &ID, const Instruction *EntryVal, 1781 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1782 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1783 "Expected either an induction phi-node or a truncate of it!"); 1784 1785 // This induction variable is not the phi from the original loop but the 1786 // newly-created IV based on the proof that casted Phi is equal to the 1787 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1788 // re-uses the same InductionDescriptor that original IV uses but we don't 1789 // have to do any recording in this case - that is done when original IV is 1790 // processed. 1791 if (isa<TruncInst>(EntryVal)) 1792 return; 1793 1794 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1795 if (Casts.empty()) 1796 return; 1797 // Only the first Cast instruction in the Casts vector is of interest. 1798 // The rest of the Casts (if exist) have no uses outside the 1799 // induction update chain itself. 
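  // Illustrative sketch only (hypothetical IR, assuming a sign-extended
  // induction): for
  //   %iv     = phi i32 [ 0, %ph ], [ %iv.next, %latch ]
  //   %iv.ext = sext i32 %iv to i64
  // the sext would be the cast recorded by the descriptor, and mapping the
  // widened IV to it lets its users pick up the vectorized value directly.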
1800 Instruction *CastInst = *Casts.begin(); 1801 if (Lane < UINT_MAX) 1802 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1803 else 1804 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1805 } 1806 1807 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1808 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1809 "Primary induction variable must have an integer type"); 1810 1811 auto II = Legal->getInductionVars()->find(IV); 1812 assert(II != Legal->getInductionVars()->end() && "IV is not an induction"); 1813 1814 auto ID = II->second; 1815 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1816 1817 // The scalar value to broadcast. This will be derived from the canonical 1818 // induction variable. 1819 Value *ScalarIV = nullptr; 1820 1821 // The value from the original loop to which we are mapping the new induction 1822 // variable. 1823 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1824 1825 // True if we have vectorized the induction variable. 1826 auto VectorizedIV = false; 1827 1828 // Determine if we want a scalar version of the induction variable. This is 1829 // true if the induction variable itself is not widened, or if it has at 1830 // least one user in the loop that is not widened. 1831 auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal); 1832 1833 // Generate code for the induction step. Note that induction steps are 1834 // required to be loop-invariant 1835 assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) && 1836 "Induction step should be loop invariant"); 1837 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1838 Value *Step = nullptr; 1839 if (PSE.getSE()->isSCEVable(IV->getType())) { 1840 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1841 Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), 1842 LoopVectorPreHeader->getTerminator()); 1843 } else { 1844 Step = cast<SCEVUnknown>(ID.getStep())->getValue(); 1845 } 1846 1847 // Try to create a new independent vector induction variable. If we can't 1848 // create the phi node, we will splat the scalar induction variable in each 1849 // loop iteration. 1850 if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) { 1851 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1852 VectorizedIV = true; 1853 } 1854 1855 // If we haven't yet vectorized the induction variable, or if we will create 1856 // a scalar one, we need to define the scalar induction variable and step 1857 // values. If we were given a truncation type, truncate the canonical 1858 // induction variable and step. Otherwise, derive these values from the 1859 // induction descriptor. 1860 if (!VectorizedIV || NeedsScalarIV) { 1861 ScalarIV = Induction; 1862 if (IV != OldInduction) { 1863 ScalarIV = IV->getType()->isIntegerTy() 1864 ? 
Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1865 : Builder.CreateCast(Instruction::SIToFP, Induction, 1866 IV->getType()); 1867 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1868 ScalarIV->setName("offset.idx"); 1869 } 1870 if (Trunc) { 1871 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1872 assert(Step->getType()->isIntegerTy() && 1873 "Truncation requires an integer step"); 1874 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1875 Step = Builder.CreateTrunc(Step, TruncType); 1876 } 1877 } 1878 1879 // If we haven't yet vectorized the induction variable, splat the scalar 1880 // induction variable, and build the necessary step vectors. 1881 // TODO: Don't do it unless the vectorized IV is really required. 1882 if (!VectorizedIV) { 1883 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1884 for (unsigned Part = 0; Part < UF; ++Part) { 1885 Value *EntryPart = 1886 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); 1887 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1888 if (Trunc) 1889 addMetadata(EntryPart, Trunc); 1890 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1891 } 1892 } 1893 1894 // If an induction variable is only used for counting loop iterations or 1895 // calculating addresses, it doesn't need to be widened. Create scalar steps 1896 // that can be used by instructions we will later scalarize. Note that the 1897 // addition of the scalar steps will not increase the number of instructions 1898 // in the loop in the common case prior to InstCombine. We will be trading 1899 // one vector extract for each scalar step. 1900 if (NeedsScalarIV) 1901 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1902 } 1903 1904 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 1905 Instruction::BinaryOps BinOp) { 1906 // Create and check the types. 1907 assert(Val->getType()->isVectorTy() && "Must be a vector"); 1908 int VLen = Val->getType()->getVectorNumElements(); 1909 1910 Type *STy = Val->getType()->getScalarType(); 1911 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1912 "Induction Step must be an integer or FP"); 1913 assert(Step->getType() == STy && "Step has wrong type"); 1914 1915 SmallVector<Constant *, 8> Indices; 1916 1917 if (STy->isIntegerTy()) { 1918 // Create a vector of consecutive numbers from zero to VF. 1919 for (int i = 0; i < VLen; ++i) 1920 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 1921 1922 // Add the consecutive indices to the vector value. 1923 Constant *Cv = ConstantVector::get(Indices); 1924 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 1925 Step = Builder.CreateVectorSplat(VLen, Step); 1926 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1927 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 1928 // which can be found from the original scalar operations. 1929 Step = Builder.CreateMul(Cv, Step); 1930 return Builder.CreateAdd(Val, Step, "induction"); 1931 } 1932 1933 // Floating point induction. 1934 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1935 "Binary Opcode should be specified for FP induction"); 1936 // Create a vector of consecutive numbers from zero to VF. 1937 for (int i = 0; i < VLen; ++i) 1938 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 1939 1940 // Add the consecutive indices to the vector value. 
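  // For example (illustrative), with VF = 4 and StartIdx = 0 this builds
  // <0.0, 1.0, 2.0, 3.0>, multiplies it by the splatted step, and combines
  // the result with Val using BinOp (fadd or fsub).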
1941 Constant *Cv = ConstantVector::get(Indices); 1942 1943 Step = Builder.CreateVectorSplat(VLen, Step); 1944 1945 // Floating point operations had to be 'fast' to enable the induction. 1946 FastMathFlags Flags; 1947 Flags.setFast(); 1948 1949 Value *MulOp = Builder.CreateFMul(Cv, Step); 1950 if (isa<Instruction>(MulOp)) 1951 // Have to check, MulOp may be a constant 1952 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 1953 1954 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1955 if (isa<Instruction>(BOp)) 1956 cast<Instruction>(BOp)->setFastMathFlags(Flags); 1957 return BOp; 1958 } 1959 1960 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 1961 Instruction *EntryVal, 1962 const InductionDescriptor &ID) { 1963 // We shouldn't have to build scalar steps if we aren't vectorizing. 1964 assert(VF > 1 && "VF should be greater than one"); 1965 1966 // Get the value type and ensure it and the step have the same integer type. 1967 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 1968 assert(ScalarIVTy == Step->getType() && 1969 "Val and Step should have the same type"); 1970 1971 // We build scalar steps for both integer and floating-point induction 1972 // variables. Here, we determine the kind of arithmetic we will perform. 1973 Instruction::BinaryOps AddOp; 1974 Instruction::BinaryOps MulOp; 1975 if (ScalarIVTy->isIntegerTy()) { 1976 AddOp = Instruction::Add; 1977 MulOp = Instruction::Mul; 1978 } else { 1979 AddOp = ID.getInductionOpcode(); 1980 MulOp = Instruction::FMul; 1981 } 1982 1983 // Determine the number of scalars we need to generate for each unroll 1984 // iteration. If EntryVal is uniform, we only need to generate the first 1985 // lane. Otherwise, we generate all VF values. 1986 unsigned Lanes = 1987 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 1988 : VF; 1989 // Compute the scalar steps and save the results in VectorLoopValueMap. 1990 for (unsigned Part = 0; Part < UF; ++Part) { 1991 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 1992 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); 1993 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 1994 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 1995 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 1996 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 1997 } 1998 } 1999 } 2000 2001 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2002 assert(V != Induction && "The new induction variable should not be used."); 2003 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2004 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2005 2006 // If we have a stride that is replaced by one, do it here. Defer this for 2007 // the VPlan-native path until we start running Legal checks in that path. 2008 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2009 V = ConstantInt::get(V->getType(), 1); 2010 2011 // If we have a vector mapped to this value, return it. 2012 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2013 return VectorLoopValueMap.getVectorValue(V, Part); 2014 2015 // If the value has not been vectorized, check if it has been scalarized 2016 // instead. If it has been scalarized, and we actually need the value in 2017 // vector form, we will construct the vector values on demand. 
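  // For instance (illustrative sketch): if V is uniform after vectorization,
  // its lane-zero scalar is simply re-broadcast; otherwise the vector is
  // rebuilt from the VF scalars with a chain of insertelement instructions
  // starting from undef, as done below.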
2018 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2019 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2020 2021 // If we've scalarized a value, that value should be an instruction. 2022 auto *I = cast<Instruction>(V); 2023 2024 // If we aren't vectorizing, we can just copy the scalar map values over to 2025 // the vector map. 2026 if (VF == 1) { 2027 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2028 return ScalarValue; 2029 } 2030 2031 // Get the last scalar instruction we generated for V and Part. If the value 2032 // is known to be uniform after vectorization, this corresponds to lane zero 2033 // of the Part unroll iteration. Otherwise, the last instruction is the one 2034 // we created for the last vector lane of the Part unroll iteration. 2035 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; 2036 auto *LastInst = cast<Instruction>( 2037 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2038 2039 // Set the insert point after the last scalarized instruction. This ensures 2040 // the insertelement sequence will directly follow the scalar definitions. 2041 auto OldIP = Builder.saveIP(); 2042 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2043 Builder.SetInsertPoint(&*NewIP); 2044 2045 // However, if we are vectorizing, we need to construct the vector values. 2046 // If the value is known to be uniform after vectorization, we can just 2047 // broadcast the scalar value corresponding to lane zero for each unroll 2048 // iteration. Otherwise, we construct the vector values using insertelement 2049 // instructions. Since the resulting vectors are stored in 2050 // VectorLoopValueMap, we will only generate the insertelements once. 2051 Value *VectorValue = nullptr; 2052 if (Cost->isUniformAfterVectorization(I, VF)) { 2053 VectorValue = getBroadcastInstrs(ScalarValue); 2054 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2055 } else { 2056 // Initialize packing with insertelements to start from undef. 2057 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2058 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2059 for (unsigned Lane = 0; Lane < VF; ++Lane) 2060 packScalarIntoVectorValue(V, {Part, Lane}); 2061 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2062 } 2063 Builder.restoreIP(OldIP); 2064 return VectorValue; 2065 } 2066 2067 // If this scalar is unknown, assume that it is a constant or that it is 2068 // loop invariant. Broadcast V and save the value for future uses. 2069 Value *B = getBroadcastInstrs(V); 2070 VectorLoopValueMap.setVectorValue(V, Part, B); 2071 return B; 2072 } 2073 2074 Value * 2075 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2076 const VPIteration &Instance) { 2077 // If the value is not an instruction contained in the loop, it should 2078 // already be scalar. 2079 if (OrigLoop->isLoopInvariant(V)) 2080 return V; 2081 2082 assert(Instance.Lane > 0 2083 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2084 : true && "Uniform values only have lane zero"); 2085 2086 // If the value from the original loop has not been vectorized, it is 2087 // represented by UF x VF scalar values in the new loop. Return the requested 2088 // scalar value. 2089 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2090 return VectorLoopValueMap.getScalarValue(V, Instance); 2091 2092 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2093 // for the given unroll part. 
If this entry is not a vector type (i.e., the 2094 // vectorization factor is one), there is no need to generate an 2095 // extractelement instruction. 2096 auto *U = getOrCreateVectorValue(V, Instance.Part); 2097 if (!U->getType()->isVectorTy()) { 2098 assert(VF == 1 && "Value not scalarized has non-vector type"); 2099 return U; 2100 } 2101 2102 // Otherwise, the value from the original loop has been vectorized and is 2103 // represented by UF vector values. Extract and return the requested scalar 2104 // value from the appropriate vector lane. 2105 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2106 } 2107 2108 void InnerLoopVectorizer::packScalarIntoVectorValue( 2109 Value *V, const VPIteration &Instance) { 2110 assert(V != Induction && "The new induction variable should not be used."); 2111 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2112 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2113 2114 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2115 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2116 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2117 Builder.getInt32(Instance.Lane)); 2118 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2119 } 2120 2121 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2122 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2123 SmallVector<Constant *, 8> ShuffleMask; 2124 for (unsigned i = 0; i < VF; ++i) 2125 ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); 2126 2127 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2128 ConstantVector::get(ShuffleMask), 2129 "reverse"); 2130 } 2131 2132 // Return whether we allow using masked interleave-groups (for dealing with 2133 // strided loads/stores that reside in predicated blocks, or for dealing 2134 // with gaps). 2135 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2136 // If an override option has been passed in for interleaved accesses, use it. 2137 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2138 return EnableMaskedInterleavedMemAccesses; 2139 2140 return TTI.enableMaskedInterleavedAccessVectorization(); 2141 } 2142 2143 // Try to vectorize the interleave group that \p Instr belongs to. 2144 // 2145 // E.g. Translate following interleaved load group (factor = 3): 2146 // for (i = 0; i < N; i+=3) { 2147 // R = Pic[i]; // Member of index 0 2148 // G = Pic[i+1]; // Member of index 1 2149 // B = Pic[i+2]; // Member of index 2 2150 // ... // do something to R, G, B 2151 // } 2152 // To: 2153 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2154 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2155 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2156 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2157 // 2158 // Or translate following interleaved store group (factor = 3): 2159 // for (i = 0; i < N; i+=3) { 2160 // ... 
do something to R, G, B 2161 // Pic[i] = R; // Member of index 0 2162 // Pic[i+1] = G; // Member of index 1 2163 // Pic[i+2] = B; // Member of index 2 2164 // } 2165 // To: 2166 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2167 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2168 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2169 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2170 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2171 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, 2172 VPTransformState &State, 2173 VPValue *Addr, 2174 VPValue *BlockInMask) { 2175 const InterleaveGroup<Instruction> *Group = 2176 Cost->getInterleavedAccessGroup(Instr); 2177 assert(Group && "Fail to get an interleaved access group."); 2178 2179 // Skip if current instruction is not the insert position. 2180 if (Instr != Group->getInsertPos()) 2181 return; 2182 2183 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2184 2185 // Prepare for the vector type of the interleaved load/store. 2186 Type *ScalarTy = getMemInstValueType(Instr); 2187 unsigned InterleaveFactor = Group->getFactor(); 2188 Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); 2189 2190 // Prepare for the new pointers. 2191 SmallVector<Value *, 2> AddrParts; 2192 unsigned Index = Group->getIndex(Instr); 2193 2194 // TODO: extend the masked interleaved-group support to reversed access. 2195 assert((!BlockInMask || !Group->isReverse()) && 2196 "Reversed masked interleave-group not supported."); 2197 2198 // If the group is reverse, adjust the index to refer to the last vector lane 2199 // instead of the first. We adjust the index from the first vector lane, 2200 // rather than directly getting the pointer for lane VF - 1, because the 2201 // pointer operand of the interleaved access is supposed to be uniform. For 2202 // uniform instructions, we're only required to generate a value for the 2203 // first vector lane in each unroll iteration. 2204 if (Group->isReverse()) 2205 Index += (VF - 1) * Group->getFactor(); 2206 2207 for (unsigned Part = 0; Part < UF; Part++) { 2208 Value *AddrPart = State.get(Addr, {Part, 0}); 2209 setDebugLocFromInst(Builder, AddrPart); 2210 2211 // Notice current instruction could be any index. Need to adjust the address 2212 // to the member of index 0. 2213 // 2214 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2215 // b = A[i]; // Member of index 0 2216 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2217 // 2218 // E.g. A[i+1] = a; // Member of index 1 2219 // A[i] = b; // Member of index 0 2220 // A[i+2] = c; // Member of index 2 (Current instruction) 2221 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2222 2223 bool InBounds = false; 2224 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2225 InBounds = gep->isInBounds(); 2226 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2227 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2228 2229 // Cast to the vector pointer type. 
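    // E.g. for the running example above (factor 3, VF 4), an i32 member
    // pointer is bitcast to <12 x i32>* to match the wide vector type.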
2230 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2231 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2232 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2233 } 2234 2235 setDebugLocFromInst(Builder, Instr); 2236 Value *UndefVec = UndefValue::get(VecTy); 2237 2238 Value *MaskForGaps = nullptr; 2239 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2240 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); 2241 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2242 } 2243 2244 // Vectorize the interleaved load group. 2245 if (isa<LoadInst>(Instr)) { 2246 // For each unroll part, create a wide load for the group. 2247 SmallVector<Value *, 2> NewLoads; 2248 for (unsigned Part = 0; Part < UF; Part++) { 2249 Instruction *NewLoad; 2250 if (BlockInMask || MaskForGaps) { 2251 assert(useMaskedInterleavedAccesses(*TTI) && 2252 "masked interleaved groups are not allowed."); 2253 Value *GroupMask = MaskForGaps; 2254 if (BlockInMask) { 2255 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2256 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2257 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2258 Value *ShuffledMask = Builder.CreateShuffleVector( 2259 BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); 2260 GroupMask = MaskForGaps 2261 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2262 MaskForGaps) 2263 : ShuffledMask; 2264 } 2265 NewLoad = 2266 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2267 GroupMask, UndefVec, "wide.masked.vec"); 2268 } 2269 else 2270 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2271 Group->getAlignment(), "wide.vec"); 2272 Group->addMetadata(NewLoad); 2273 NewLoads.push_back(NewLoad); 2274 } 2275 2276 // For each member in the group, shuffle out the appropriate data from the 2277 // wide loads. 2278 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2279 Instruction *Member = Group->getMember(I); 2280 2281 // Skip the gaps in the group. 2282 if (!Member) 2283 continue; 2284 2285 Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF); 2286 for (unsigned Part = 0; Part < UF; Part++) { 2287 Value *StridedVec = Builder.CreateShuffleVector( 2288 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2289 2290 // If this member has different type, cast the result type. 2291 if (Member->getType() != ScalarTy) { 2292 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2293 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2294 } 2295 2296 if (Group->isReverse()) 2297 StridedVec = reverseVector(StridedVec); 2298 2299 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2300 } 2301 } 2302 return; 2303 } 2304 2305 // The sub vector type for current instruction. 2306 VectorType *SubVT = VectorType::get(ScalarTy, VF); 2307 2308 // Vectorize the interleaved store group. 2309 for (unsigned Part = 0; Part < UF; Part++) { 2310 // Collect the stored vector from each member. 
2311 SmallVector<Value *, 4> StoredVecs; 2312 for (unsigned i = 0; i < InterleaveFactor; i++) { 2313 // Interleaved store group doesn't allow a gap, so each index has a member 2314 Instruction *Member = Group->getMember(i); 2315 assert(Member && "Fail to get a member from an interleaved store group"); 2316 2317 Value *StoredVec = getOrCreateVectorValue( 2318 cast<StoreInst>(Member)->getValueOperand(), Part); 2319 if (Group->isReverse()) 2320 StoredVec = reverseVector(StoredVec); 2321 2322 // If this member has different type, cast it to a unified type. 2323 2324 if (StoredVec->getType() != SubVT) 2325 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2326 2327 StoredVecs.push_back(StoredVec); 2328 } 2329 2330 // Concatenate all vectors into a wide vector. 2331 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2332 2333 // Interleave the elements in the wide vector. 2334 Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor); 2335 Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, 2336 "interleaved.vec"); 2337 2338 Instruction *NewStoreInstr; 2339 if (BlockInMask) { 2340 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2341 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2342 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2343 Value *ShuffledMask = Builder.CreateShuffleVector( 2344 BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); 2345 NewStoreInstr = Builder.CreateMaskedStore( 2346 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2347 } 2348 else 2349 NewStoreInstr = Builder.CreateAlignedStore(IVec, AddrParts[Part], 2350 Group->getAlignment()); 2351 2352 Group->addMetadata(NewStoreInstr); 2353 } 2354 } 2355 2356 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2357 VPTransformState &State, 2358 VPValue *Addr, 2359 VPValue *BlockInMask) { 2360 // Attempt to issue a wide load. 2361 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2362 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2363 2364 assert((LI || SI) && "Invalid Load/Store instruction"); 2365 2366 LoopVectorizationCostModel::InstWidening Decision = 2367 Cost->getWideningDecision(Instr, VF); 2368 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 2369 "CM decision should be taken at this point"); 2370 if (Decision == LoopVectorizationCostModel::CM_Interleave) 2371 return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask); 2372 2373 Type *ScalarDataTy = getMemInstValueType(Instr); 2374 Type *DataTy = VectorType::get(ScalarDataTy, VF); 2375 // An alignment of 0 means target abi alignment. We need to use the scalar's 2376 // target abi alignment in such a case. 2377 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2378 const Align Alignment = 2379 DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy); 2380 2381 // Determine if the pointer operand of the access is either consecutive or 2382 // reverse consecutive. 2383 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2384 bool ConsecutiveStride = 2385 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2386 bool CreateGatherScatter = 2387 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2388 2389 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2390 // gather/scatter. Otherwise Decision should have been to Scalarize. 
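  // Illustrative sketch of the two widened forms for an i32 load with VF = 4:
  //   consecutive stride: %wide.load = load <4 x i32>, <4 x i32>* %ptr
  //   gather/scatter:     a call to the llvm.masked.gather intrinsic taking a
  //                       <4 x i32*> vector of pointers and a <4 x i1> mask.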
2391 assert((ConsecutiveStride || CreateGatherScatter) && 2392 "The instruction should be scalarized"); 2393 (void)ConsecutiveStride; 2394 2395 VectorParts BlockInMaskParts(UF); 2396 bool isMaskRequired = BlockInMask; 2397 if (isMaskRequired) 2398 for (unsigned Part = 0; Part < UF; ++Part) 2399 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2400 2401 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2402 // Calculate the pointer for the specific unroll-part. 2403 GetElementPtrInst *PartPtr = nullptr; 2404 2405 bool InBounds = false; 2406 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2407 InBounds = gep->isInBounds(); 2408 2409 if (Reverse) { 2410 // If the address is consecutive but reversed, then the 2411 // wide store needs to start at the last vector element. 2412 PartPtr = cast<GetElementPtrInst>( 2413 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); 2414 PartPtr->setIsInBounds(InBounds); 2415 PartPtr = cast<GetElementPtrInst>( 2416 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); 2417 PartPtr->setIsInBounds(InBounds); 2418 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2419 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2420 } else { 2421 PartPtr = cast<GetElementPtrInst>( 2422 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); 2423 PartPtr->setIsInBounds(InBounds); 2424 } 2425 2426 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2427 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2428 }; 2429 2430 // Handle Stores: 2431 if (SI) { 2432 setDebugLocFromInst(Builder, SI); 2433 2434 for (unsigned Part = 0; Part < UF; ++Part) { 2435 Instruction *NewSI = nullptr; 2436 Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part); 2437 if (CreateGatherScatter) { 2438 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2439 Value *VectorGep = State.get(Addr, Part); 2440 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, 2441 Alignment.value(), MaskPart); 2442 } else { 2443 if (Reverse) { 2444 // If we store to reverse consecutive memory locations, then we need 2445 // to reverse the order of elements in the stored value. 2446 StoredVal = reverseVector(StoredVal); 2447 // We don't want to update the value in the map as it might be used in 2448 // another expression. So don't call resetVectorValue(StoredVal). 2449 } 2450 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2451 if (isMaskRequired) 2452 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2453 BlockInMaskParts[Part]); 2454 else 2455 NewSI = 2456 Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value()); 2457 } 2458 addMetadata(NewSI, SI); 2459 } 2460 return; 2461 } 2462 2463 // Handle loads. 2464 assert(LI && "Must have a load instruction"); 2465 setDebugLocFromInst(Builder, LI); 2466 for (unsigned Part = 0; Part < UF; ++Part) { 2467 Value *NewLI; 2468 if (CreateGatherScatter) { 2469 Value *MaskPart = isMaskRequired ? 
                                           BlockInMaskParts[Part] : nullptr;
      Value *VectorGep = State.get(Addr, Part);
      NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart,
                                         nullptr, "wide.masked.gather");
      addMetadata(NewLI, LI);
    } else {
      auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
      if (isMaskRequired)
        NewLI = Builder.CreateMaskedLoad(
            VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
            "wide.masked.load");
      else
        NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(),
                                          "wide.load");

      // Add metadata to the load, but setVectorValue to the reverse shuffle.
      addMetadata(NewLI, LI);
      if (Reverse)
        NewLI = reverseVector(NewLI);
    }
    VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
  }
}

void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
                                               const VPIteration &Instance,
                                               bool IfPredicateInstr) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");

  setDebugLocFromInst(Builder, Instr);

  // Does this instruction return a value?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy)
    Cloned->setName(Instr->getName() + ".cloned");

  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
    auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
    Cloned->setOperand(op, NewOp);
  }
  addNewMetadata(Cloned, Instr);

  // Place the cloned scalar in the new loop.
  Builder.Insert(Cloned);

  // Add the cloned scalar to the scalar map entry.
  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);

  // If we just cloned a new assumption, add it to the assumption cache.
  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
    if (II->getIntrinsicID() == Intrinsic::assume)
      AC->registerAssumption(II);

  // End if-block.
  if (IfPredicateInstr)
    PredicatedInstructions.push_back(Cloned);
}

PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                      Value *End, Value *Step,
                                                      Instruction *DL) {
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible no latch exists
  // yet. If so, use the header as this will be a single block loop.
  if (!Latch)
    Latch = Header;

  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
  setDebugLocFromInst(Builder, OldInst);
  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");

  Builder.SetInsertPoint(Latch->getTerminator());
  setDebugLocFromInst(Builder, OldInst);

  // Create i+1 and fill the PHINode.
  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(Start, L->getLoopPreheader());
  Induction->addIncoming(Next, Latch);
  // Create the compare.
  Value *ICmp = Builder.CreateICmpEQ(Next, End);
  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);

  // Now we have two terminators. Remove the old one from the block.
2558 Latch->getTerminator()->eraseFromParent(); 2559 2560 return Induction; 2561 } 2562 2563 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2564 if (TripCount) 2565 return TripCount; 2566 2567 assert(L && "Create Trip Count for null loop."); 2568 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2569 // Find the loop boundaries. 2570 ScalarEvolution *SE = PSE.getSE(); 2571 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2572 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2573 "Invalid loop count"); 2574 2575 Type *IdxTy = Legal->getWidestInductionType(); 2576 assert(IdxTy && "No type for induction"); 2577 2578 // The exit count might have the type of i64 while the phi is i32. This can 2579 // happen if we have an induction variable that is sign extended before the 2580 // compare. The only way that we get a backedge taken count is that the 2581 // induction variable was signed and as such will not overflow. In such a case 2582 // truncation is legal. 2583 if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() > 2584 IdxTy->getPrimitiveSizeInBits()) 2585 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2586 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2587 2588 // Get the total trip count from the count by adding 1. 2589 const SCEV *ExitCount = SE->getAddExpr( 2590 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2591 2592 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2593 2594 // Expand the trip count and place the new instructions in the preheader. 2595 // Notice that the pre-header does not change, only the loop body. 2596 SCEVExpander Exp(*SE, DL, "induction"); 2597 2598 // Count holds the overall loop count (N). 2599 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2600 L->getLoopPreheader()->getTerminator()); 2601 2602 if (TripCount->getType()->isPointerTy()) 2603 TripCount = 2604 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2605 L->getLoopPreheader()->getTerminator()); 2606 2607 return TripCount; 2608 } 2609 2610 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2611 if (VectorTripCount) 2612 return VectorTripCount; 2613 2614 Value *TC = getOrCreateTripCount(L); 2615 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2616 2617 Type *Ty = TC->getType(); 2618 Constant *Step = ConstantInt::get(Ty, VF * UF); 2619 2620 // If the tail is to be folded by masking, round the number of iterations N 2621 // up to a multiple of Step instead of rounding down. This is done by first 2622 // adding Step-1 and then rounding down. Note that it's ok if this addition 2623 // overflows: the vector induction variable will eventually wrap to zero given 2624 // that it starts at zero and its Step is a power of two; the loop will then 2625 // exit, with the last early-exit vector comparison also producing all-true. 2626 if (Cost->foldTailByMasking()) { 2627 assert(isPowerOf2_32(VF * UF) && 2628 "VF*UF must be a power of 2 when folding tail by masking"); 2629 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); 2630 } 2631 2632 // Now we need to generate the expression for the part of the loop that the 2633 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2634 // iterations are not required for correctness, or N - Step, otherwise. 
Step 2635 // is equal to the vectorization factor (number of SIMD elements) times the 2636 // unroll factor (number of SIMD instructions). 2637 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2638 2639 // If there is a non-reversed interleaved group that may speculatively access 2640 // memory out-of-bounds, we need to ensure that there will be at least one 2641 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2642 // the trip count, we set the remainder to be equal to the step. If the step 2643 // does not evenly divide the trip count, no adjustment is necessary since 2644 // there will already be scalar iterations. Note that the minimum iterations 2645 // check ensures that N >= Step. 2646 if (VF > 1 && Cost->requiresScalarEpilogue()) { 2647 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2648 R = Builder.CreateSelect(IsZero, Step, R); 2649 } 2650 2651 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2652 2653 return VectorTripCount; 2654 } 2655 2656 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2657 const DataLayout &DL) { 2658 // Verify that V is a vector type with same number of elements as DstVTy. 2659 unsigned VF = DstVTy->getNumElements(); 2660 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2661 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2662 Type *SrcElemTy = SrcVecTy->getElementType(); 2663 Type *DstElemTy = DstVTy->getElementType(); 2664 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2665 "Vector elements must have same size"); 2666 2667 // Do a direct cast if element types are castable. 2668 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2669 return Builder.CreateBitOrPointerCast(V, DstVTy); 2670 } 2671 // V cannot be directly casted to desired vector type. 2672 // May happen when V is a floating point vector but DstVTy is a vector of 2673 // pointers or vice-versa. Handle this using a two-step bitcast using an 2674 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2675 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2676 "Only one type should be a pointer type"); 2677 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2678 "Only one type should be a floating point type"); 2679 Type *IntTy = 2680 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2681 VectorType *VecIntTy = VectorType::get(IntTy, VF); 2682 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2683 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2684 } 2685 2686 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2687 BasicBlock *Bypass) { 2688 Value *Count = getOrCreateTripCount(L); 2689 // Reuse existing vector loop preheader for TC checks. 2690 // Note that new preheader block is generated for vector loop. 2691 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2692 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2693 2694 // Generate code to check if the loop's trip count is less than VF * UF, or 2695 // equal to it in case a scalar epilogue is required; this implies that the 2696 // vector trip count is zero. This check also covers the case where adding one 2697 // to the backedge-taken count overflowed leading to an incorrect trip count 2698 // of zero. In this case we will also jump to the scalar loop. 2699 auto P = Cost->requiresScalarEpilogue() ? 
ICmpInst::ICMP_ULE 2700 : ICmpInst::ICMP_ULT; 2701 2702 // If tail is to be folded, vector loop takes care of all iterations. 2703 Value *CheckMinIters = Builder.getFalse(); 2704 if (!Cost->foldTailByMasking()) 2705 CheckMinIters = Builder.CreateICmp( 2706 P, Count, ConstantInt::get(Count->getType(), VF * UF), 2707 "min.iters.check"); 2708 2709 // Create new preheader for vector loop. 2710 LoopVectorPreHeader = 2711 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2712 "vector.ph"); 2713 2714 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2715 DT->getNode(Bypass)->getIDom()) && 2716 "TC check is expected to dominate Bypass"); 2717 2718 // Update dominator for Bypass & LoopExit. 2719 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2720 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2721 2722 ReplaceInstWithInst( 2723 TCCheckBlock->getTerminator(), 2724 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2725 LoopBypassBlocks.push_back(TCCheckBlock); 2726 } 2727 2728 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2729 // Reuse existing vector loop preheader for SCEV checks. 2730 // Note that new preheader block is generated for vector loop. 2731 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 2732 2733 // Generate the code to check that the SCEV assumptions that we made. 2734 // We want the new basic block to start at the first instruction in a 2735 // sequence of instructions that form a check. 2736 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2737 "scev.check"); 2738 Value *SCEVCheck = Exp.expandCodeForPredicate( 2739 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 2740 2741 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2742 if (C->isZero()) 2743 return; 2744 2745 assert(!SCEVCheckBlock->getParent()->hasOptSize() && 2746 "Cannot SCEV check stride or overflow when optimizing for size"); 2747 2748 SCEVCheckBlock->setName("vector.scevcheck"); 2749 // Create new preheader for vector loop. 2750 LoopVectorPreHeader = 2751 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 2752 nullptr, "vector.ph"); 2753 2754 // Update dominator only if this is first RT check. 2755 if (LoopBypassBlocks.empty()) { 2756 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2757 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2758 } 2759 2760 ReplaceInstWithInst( 2761 SCEVCheckBlock->getTerminator(), 2762 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 2763 LoopBypassBlocks.push_back(SCEVCheckBlock); 2764 AddedSafetyChecks = true; 2765 } 2766 2767 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2768 // VPlan-native path does not do any analysis for runtime checks currently. 2769 if (EnableVPlanNativePath) 2770 return; 2771 2772 // Reuse existing vector loop preheader for runtime memory checks. 2773 // Note that new preheader block is generated for vector loop. 2774 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 2775 2776 // Generate the code that checks in runtime if arrays overlap. We put the 2777 // checks into a separate block to make the more common case of few elements 2778 // faster. 
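  // As an illustrative sketch (shorthand, not the exact IR produced by
  // addRuntimeChecks below): for two pointer groups A and B the emitted
  // check is roughly
  //   %conflict = (A.start < B.end) & (B.start < A.end)
  // and execution is diverted to the scalar loop if any pair may conflict.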
2779 Instruction *FirstCheckInst; 2780 Instruction *MemRuntimeCheck; 2781 std::tie(FirstCheckInst, MemRuntimeCheck) = 2782 Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator()); 2783 if (!MemRuntimeCheck) 2784 return; 2785 2786 if (MemCheckBlock->getParent()->hasOptSize()) { 2787 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2788 "Cannot emit memory checks when optimizing for size, unless forced " 2789 "to vectorize."); 2790 ORE->emit([&]() { 2791 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2792 L->getStartLoc(), L->getHeader()) 2793 << "Code-size may be reduced by not forcing " 2794 "vectorization, or by source-code modifications " 2795 "eliminating the need for runtime checks " 2796 "(e.g., adding 'restrict')."; 2797 }); 2798 } 2799 2800 MemCheckBlock->setName("vector.memcheck"); 2801 // Create new preheader for vector loop. 2802 LoopVectorPreHeader = 2803 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2804 "vector.ph"); 2805 2806 // Update dominator only if this is first RT check. 2807 if (LoopBypassBlocks.empty()) { 2808 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2809 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2810 } 2811 2812 ReplaceInstWithInst( 2813 MemCheckBlock->getTerminator(), 2814 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); 2815 LoopBypassBlocks.push_back(MemCheckBlock); 2816 AddedSafetyChecks = true; 2817 2818 // We currently don't use LoopVersioning for the actual loop cloning but we 2819 // still use it to add the noalias metadata. 2820 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2821 PSE.getSE()); 2822 LVer->prepareNoAliasMetadata(); 2823 } 2824 2825 Value *InnerLoopVectorizer::emitTransformedIndex( 2826 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2827 const InductionDescriptor &ID) const { 2828 2829 SCEVExpander Exp(*SE, DL, "induction"); 2830 auto Step = ID.getStep(); 2831 auto StartValue = ID.getStartValue(); 2832 assert(Index->getType() == Step->getType() && 2833 "Index type does not match StepValue type"); 2834 2835 // Note: the IR at this point is broken. We cannot use SE to create any new 2836 // SCEV and then expand it, hoping that SCEV's simplification will give us 2837 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2838 // lead to various SCEV crashes. So all we can do is to use builder and rely 2839 // on InstCombine for future simplifications. Here we handle some trivial 2840 // cases only. 
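  // The helpers below therefore fold only the trivial cases, e.g.
  //   CreateAdd(0, X) --> X
  //   CreateMul(1, X) --> X
  // and leave everything else to later InstCombine runs.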
2841 auto CreateAdd = [&B](Value *X, Value *Y) { 2842 assert(X->getType() == Y->getType() && "Types don't match!"); 2843 if (auto *CX = dyn_cast<ConstantInt>(X)) 2844 if (CX->isZero()) 2845 return Y; 2846 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2847 if (CY->isZero()) 2848 return X; 2849 return B.CreateAdd(X, Y); 2850 }; 2851 2852 auto CreateMul = [&B](Value *X, Value *Y) { 2853 assert(X->getType() == Y->getType() && "Types don't match!"); 2854 if (auto *CX = dyn_cast<ConstantInt>(X)) 2855 if (CX->isOne()) 2856 return Y; 2857 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2858 if (CY->isOne()) 2859 return X; 2860 return B.CreateMul(X, Y); 2861 }; 2862 2863 switch (ID.getKind()) { 2864 case InductionDescriptor::IK_IntInduction: { 2865 assert(Index->getType() == StartValue->getType() && 2866 "Index type does not match StartValue type"); 2867 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 2868 return B.CreateSub(StartValue, Index); 2869 auto *Offset = CreateMul( 2870 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); 2871 return CreateAdd(StartValue, Offset); 2872 } 2873 case InductionDescriptor::IK_PtrInduction: { 2874 assert(isa<SCEVConstant>(Step) && 2875 "Expected constant step for pointer induction"); 2876 return B.CreateGEP( 2877 StartValue->getType()->getPointerElementType(), StartValue, 2878 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), 2879 &*B.GetInsertPoint()))); 2880 } 2881 case InductionDescriptor::IK_FpInduction: { 2882 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2883 auto InductionBinOp = ID.getInductionBinOp(); 2884 assert(InductionBinOp && 2885 (InductionBinOp->getOpcode() == Instruction::FAdd || 2886 InductionBinOp->getOpcode() == Instruction::FSub) && 2887 "Original bin op should be defined for FP induction"); 2888 2889 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 2890 2891 // Floating point operations had to be 'fast' to enable the induction. 2892 FastMathFlags Flags; 2893 Flags.setFast(); 2894 2895 Value *MulExp = B.CreateFMul(StepValue, Index); 2896 if (isa<Instruction>(MulExp)) 2897 // We have to check, the MulExp may be a constant. 2898 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 2899 2900 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2901 "induction"); 2902 if (isa<Instruction>(BOp)) 2903 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2904 2905 return BOp; 2906 } 2907 case InductionDescriptor::IK_NoInduction: 2908 return nullptr; 2909 } 2910 llvm_unreachable("invalid enum"); 2911 } 2912 2913 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 2914 /* 2915 In this function we generate a new loop. The new loop will contain 2916 the vectorized instructions while the old loop will continue to run the 2917 scalar remainder. 2918 2919 [ ] <-- loop iteration number check. 2920 / | 2921 / v 2922 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2923 | / | 2924 | / v 2925 || [ ] <-- vector pre header. 2926 |/ | 2927 | v 2928 | [ ] \ 2929 | [ ]_| <-- vector loop. 2930 | | 2931 | v 2932 | -[ ] <--- middle-block. 2933 | / | 2934 | / v 2935 -|- >[ ] <--- new preheader. 2936 | | 2937 | v 2938 | [ ] \ 2939 | [ ]_| <-- old scalar loop to handle remainder. 2940 \ | 2941 \ v 2942 >[ ] <-- exit block. 2943 ... 2944 */ 2945 2946 MDNode *OrigLoopID = OrigLoop->getLoopID(); 2947 2948 // Some loops have a single integer induction variable, while other loops 2949 // don't. 
One example is c++ iterators that often have multiple pointer 2950 // induction variables. In the code below we also support a case where we 2951 // don't have a single induction variable. 2952 // 2953 // We try to obtain an induction variable from the original loop as hard 2954 // as possible. However if we don't find one that: 2955 // - is an integer 2956 // - counts from zero, stepping by one 2957 // - is the size of the widest induction variable type 2958 // then we create a new one. 2959 OldInduction = Legal->getPrimaryInduction(); 2960 Type *IdxTy = Legal->getWidestInductionType(); 2961 2962 // Split the single block loop into the two loop structure described above. 2963 LoopScalarBody = OrigLoop->getHeader(); 2964 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 2965 LoopExitBlock = OrigLoop->getExitBlock(); 2966 assert(LoopExitBlock && "Must have an exit block"); 2967 assert(LoopVectorPreHeader && "Invalid loop structure"); 2968 2969 LoopMiddleBlock = 2970 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2971 LI, nullptr, "middle.block"); 2972 LoopScalarPreHeader = 2973 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 2974 nullptr, "scalar.ph"); 2975 // We intentionally don't let SplitBlock to update LoopInfo since 2976 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 2977 // LoopVectorBody is explicitly added to the correct place few lines later. 2978 LoopVectorBody = 2979 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2980 nullptr, nullptr, "vector.body"); 2981 2982 // Update dominator for loop exit. 2983 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 2984 2985 // Create and register the new vector loop. 2986 Loop *Lp = LI->AllocateLoop(); 2987 Loop *ParentLoop = OrigLoop->getParentLoop(); 2988 2989 // Insert the new loop into the loop nest and register the new basic blocks 2990 // before calling any utilities such as SCEV that require valid LoopInfo. 2991 if (ParentLoop) { 2992 ParentLoop->addChildLoop(Lp); 2993 } else { 2994 LI->addTopLevelLoop(Lp); 2995 } 2996 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 2997 2998 // Find the loop boundaries. 2999 Value *Count = getOrCreateTripCount(Lp); 3000 3001 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3002 3003 // Now, compare the new count to zero. If it is zero skip the vector loop and 3004 // jump to the scalar loop. This check also covers the case where the 3005 // backedge-taken count is uint##_max: adding one to it will overflow leading 3006 // to an incorrect trip count of zero. In this (rare) case we will also jump 3007 // to the scalar loop. 3008 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3009 3010 // Generate the code to check any assumptions that we've made for SCEV 3011 // expressions. 3012 emitSCEVChecks(Lp, LoopScalarPreHeader); 3013 3014 // Generate the code that checks in runtime if arrays overlap. We put the 3015 // checks into a separate block to make the more common case of few elements 3016 // faster. 3017 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3018 3019 // Generate the induction variable. 3020 // The loop step is equal to the vectorization factor (num of SIMD elements) 3021 // times the unroll factor (num of SIMD instructions). 
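  // For example, with VF = 4 and UF = 2 the induction variable created below
  // steps by 8 per vector iteration (0, 8, 16, ...) and the vector loop exits
  // once it reaches the vector trip count (CountRoundDown).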
3022 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3023 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 3024 Induction = 3025 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3026 getDebugLocFromInstOrOperands(OldInduction)); 3027 3028 // We are going to resume the execution of the scalar loop. 3029 // Go over all of the induction variables that we found and fix the 3030 // PHIs that are left in the scalar version of the loop. 3031 // The starting values of PHI nodes depend on the counter of the last 3032 // iteration in the vectorized loop. 3033 // If we come from a bypass edge then we need to start from the original 3034 // start value. 3035 3036 // This variable saves the new starting index for the scalar loop. It is used 3037 // to test if there are any tail iterations left once the vector loop has 3038 // completed. 3039 LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); 3040 for (auto &InductionEntry : *List) { 3041 PHINode *OrigPhi = InductionEntry.first; 3042 InductionDescriptor II = InductionEntry.second; 3043 3044 // Create phi nodes to merge from the backedge-taken check block. 3045 PHINode *BCResumeVal = 3046 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3047 LoopScalarPreHeader->getTerminator()); 3048 // Copy original phi DL over to the new one. 3049 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3050 Value *&EndValue = IVEndValues[OrigPhi]; 3051 if (OrigPhi == OldInduction) { 3052 // We know what the end value is. 3053 EndValue = CountRoundDown; 3054 } else { 3055 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); 3056 Type *StepType = II.getStep()->getType(); 3057 Instruction::CastOps CastOp = 3058 CastInst::getCastOpcode(CountRoundDown, true, StepType, true); 3059 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); 3060 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3061 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3062 EndValue->setName("ind.end"); 3063 } 3064 3065 // The new PHI merges the original incoming value, in case of a bypass, 3066 // or the value at the end of the vectorized loop. 3067 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3068 3069 // Fix the scalar body counter (PHI node). 3070 // The old induction's phi node in the scalar body needs the truncated 3071 // value. 3072 for (BasicBlock *BB : LoopBypassBlocks) 3073 BCResumeVal->addIncoming(II.getStartValue(), BB); 3074 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3075 } 3076 3077 // We need the OrigLoop (scalar loop part) latch terminator to help 3078 // produce correct debug info for the middle block BB instructions. 3079 // The legality check stage guarantees that the loop will have a single 3080 // latch. 3081 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3082 "Scalar loop latch terminator isn't a branch"); 3083 BranchInst *ScalarLatchBr = 3084 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3085 3086 // Add a check in the middle block to see if we have completed 3087 // all of the iterations in the first vector loop. 3088 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3089 // If tail is to be folded, we know we don't need to run the remainder. 
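  // For example, with N = 10 and VF * UF = 4 (and no scalar epilogue
  // required), N - N % (VF * UF) = 8 != 10, so the scalar loop runs the last
  // two iterations; with N = 8 the counts match and the remainder is skipped.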
3090 Value *CmpN = Builder.getTrue(); 3091 if (!Cost->foldTailByMasking()) { 3092 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3093 CountRoundDown, "cmp.n", 3094 LoopMiddleBlock->getTerminator()); 3095 3096 // Here we use the same DebugLoc as the scalar loop latch branch instead 3097 // of the corresponding compare because they may have ended up with 3098 // different line numbers and we want to avoid awkward line stepping while 3099 // debugging. Eg. if the compare has got a line number inside the loop. 3100 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3101 } 3102 3103 BranchInst *BrInst = 3104 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3105 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3106 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3107 3108 // Get ready to start creating new instructions into the vectorized body. 3109 assert(LoopVectorPreHeader == Lp->getLoopPreheader() && 3110 "Inconsistent vector loop preheader"); 3111 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3112 3113 Optional<MDNode *> VectorizedLoopID = 3114 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3115 LLVMLoopVectorizeFollowupVectorized}); 3116 if (VectorizedLoopID.hasValue()) { 3117 Lp->setLoopID(VectorizedLoopID.getValue()); 3118 3119 // Do not setAlreadyVectorized if loop attributes have been defined 3120 // explicitly. 3121 return LoopVectorPreHeader; 3122 } 3123 3124 // Keep all loop hints from the original loop on the vector loop (we'll 3125 // replace the vectorizer-specific hints below). 3126 if (MDNode *LID = OrigLoop->getLoopID()) 3127 Lp->setLoopID(LID); 3128 3129 LoopVectorizeHints Hints(Lp, true, *ORE); 3130 Hints.setAlreadyVectorized(); 3131 3132 #ifdef EXPENSIVE_CHECKS 3133 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3134 LI->verify(*DT); 3135 #endif 3136 3137 return LoopVectorPreHeader; 3138 } 3139 3140 // Fix up external users of the induction variable. At this point, we are 3141 // in LCSSA form, with all external PHIs that use the IV having one input value, 3142 // coming from the remainder loop. We need those PHIs to also have a correct 3143 // value for the IV when arriving directly from the middle block. 3144 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3145 const InductionDescriptor &II, 3146 Value *CountRoundDown, Value *EndValue, 3147 BasicBlock *MiddleBlock) { 3148 // There are two kinds of external IV usages - those that use the value 3149 // computed in the last iteration (the PHI) and those that use the penultimate 3150 // value (the value that feeds into the phi from the loop latch). 3151 // We allow both, but they, obviously, have different values. 3152 3153 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3154 3155 DenseMap<Value *, Value *> MissingVals; 3156 3157 // An external user of the last iteration's value should see the value that 3158 // the remainder loop uses to initialize its own IV. 3159 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3160 for (User *U : PostInc->users()) { 3161 Instruction *UI = cast<Instruction>(U); 3162 if (!OrigLoop->contains(UI)) { 3163 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3164 MissingVals[UI] = EndValue; 3165 } 3166 } 3167 3168 // An external user of the penultimate value need to see EndValue - Step. 3169 // The simplest way to get this is to recompute it from the constituent SCEVs, 3170 // that is Start + (Step * (CRD - 1)). 
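  // For example, for a canonical IV with Start = 0 and Step = 1, an external
  // user of the phi itself should see CRD - 1, whereas an external user of
  // the post-increment value (handled above) sees CRD.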
3171 for (User *U : OrigPhi->users()) { 3172 auto *UI = cast<Instruction>(U); 3173 if (!OrigLoop->contains(UI)) { 3174 const DataLayout &DL = 3175 OrigLoop->getHeader()->getModule()->getDataLayout(); 3176 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3177 3178 IRBuilder<> B(MiddleBlock->getTerminator()); 3179 Value *CountMinusOne = B.CreateSub( 3180 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3181 Value *CMO = 3182 !II.getStep()->getType()->isIntegerTy() 3183 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3184 II.getStep()->getType()) 3185 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3186 CMO->setName("cast.cmo"); 3187 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3188 Escape->setName("ind.escape"); 3189 MissingVals[UI] = Escape; 3190 } 3191 } 3192 3193 for (auto &I : MissingVals) { 3194 PHINode *PHI = cast<PHINode>(I.first); 3195 // One corner case we have to handle is two IVs "chasing" each-other, 3196 // that is %IV2 = phi [...], [ %IV1, %latch ] 3197 // In this case, if IV1 has an external use, we need to avoid adding both 3198 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3199 // don't already have an incoming value for the middle block. 3200 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3201 PHI->addIncoming(I.second, MiddleBlock); 3202 } 3203 } 3204 3205 namespace { 3206 3207 struct CSEDenseMapInfo { 3208 static bool canHandle(const Instruction *I) { 3209 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3210 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3211 } 3212 3213 static inline Instruction *getEmptyKey() { 3214 return DenseMapInfo<Instruction *>::getEmptyKey(); 3215 } 3216 3217 static inline Instruction *getTombstoneKey() { 3218 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3219 } 3220 3221 static unsigned getHashValue(const Instruction *I) { 3222 assert(canHandle(I) && "Unknown instruction!"); 3223 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3224 I->value_op_end())); 3225 } 3226 3227 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3228 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3229 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3230 return LHS == RHS; 3231 return LHS->isIdenticalTo(RHS); 3232 } 3233 }; 3234 3235 } // end anonymous namespace 3236 3237 ///Perform cse of induction variable instructions. 3238 static void cse(BasicBlock *BB) { 3239 // Perform simple cse. 3240 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3241 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3242 Instruction *In = &*I++; 3243 3244 if (!CSEDenseMapInfo::canHandle(In)) 3245 continue; 3246 3247 // Check if we can replace this instruction with any of the 3248 // visited instructions. 3249 if (Instruction *V = CSEMap.lookup(In)) { 3250 In->replaceAllUsesWith(V); 3251 In->eraseFromParent(); 3252 continue; 3253 } 3254 3255 CSEMap[In] = In; 3256 } 3257 } 3258 3259 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3260 unsigned VF, 3261 bool &NeedToScalarize) { 3262 Function *F = CI->getCalledFunction(); 3263 Type *ScalarRetTy = CI->getType(); 3264 SmallVector<Type *, 4> Tys, ScalarTys; 3265 for (auto &ArgOp : CI->arg_operands()) 3266 ScalarTys.push_back(ArgOp->getType()); 3267 3268 // Estimate cost of scalarized vector call. 
The source operands are assumed 3269 // to be vectors, so we need to extract individual elements from there, 3270 // execute VF scalar calls, and then gather the result into the vector return 3271 // value. 3272 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); 3273 if (VF == 1) 3274 return ScalarCallCost; 3275 3276 // Compute corresponding vector type for return value and arguments. 3277 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3278 for (Type *ScalarTy : ScalarTys) 3279 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3280 3281 // Compute costs of unpacking argument values for the scalar calls and 3282 // packing the return values to a vector. 3283 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3284 3285 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3286 3287 // If we can't emit a vector call for this function, then the currently found 3288 // cost is the cost we need to return. 3289 NeedToScalarize = true; 3290 if (!TLI || CI->isNoBuiltin() || VFDatabase::getMappings(*CI).empty()) 3291 return Cost; 3292 3293 // If the corresponding vector cost is cheaper, return its cost. 3294 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); 3295 if (VectorCallCost < Cost) { 3296 NeedToScalarize = false; 3297 return VectorCallCost; 3298 } 3299 return Cost; 3300 } 3301 3302 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3303 unsigned VF) { 3304 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3305 assert(ID && "Expected intrinsic call!"); 3306 3307 FastMathFlags FMF; 3308 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3309 FMF = FPMO->getFastMathFlags(); 3310 3311 SmallVector<Value *, 4> Operands(CI->arg_operands()); 3312 return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF); 3313 } 3314 3315 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3316 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3317 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3318 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3319 } 3320 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3321 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3322 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3323 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3324 } 3325 3326 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3327 // For every instruction `I` in MinBWs, truncate the operands, create a 3328 // truncated version of `I` and reextend its result. InstCombine runs 3329 // later and will remove any ext/trunc pairs. 3330 SmallPtrSet<Value *, 4> Erased; 3331 for (const auto &KV : Cost->getMinimalBitwidths()) { 3332 // If the value wasn't vectorized, we must maintain the original scalar 3333 // type. The absence of the value from VectorLoopValueMap indicates that it 3334 // wasn't vectorized. 
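    // For the values that were vectorized, the rewrite below is, e.g.: a
    // <4 x i32> add whose result is known to need only 8 bits becomes a
    // trunc to <4 x i8>, an i8 add, and a zext back to <4 x i32>; redundant
    // ext/trunc pairs are removed later by InstCombine.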
3335 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3336 continue; 3337 for (unsigned Part = 0; Part < UF; ++Part) { 3338 Value *I = getOrCreateVectorValue(KV.first, Part); 3339 if (Erased.find(I) != Erased.end() || I->use_empty() || 3340 !isa<Instruction>(I)) 3341 continue; 3342 Type *OriginalTy = I->getType(); 3343 Type *ScalarTruncatedTy = 3344 IntegerType::get(OriginalTy->getContext(), KV.second); 3345 Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, 3346 OriginalTy->getVectorNumElements()); 3347 if (TruncatedTy == OriginalTy) 3348 continue; 3349 3350 IRBuilder<> B(cast<Instruction>(I)); 3351 auto ShrinkOperand = [&](Value *V) -> Value * { 3352 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3353 if (ZI->getSrcTy() == TruncatedTy) 3354 return ZI->getOperand(0); 3355 return B.CreateZExtOrTrunc(V, TruncatedTy); 3356 }; 3357 3358 // The actual instruction modification depends on the instruction type, 3359 // unfortunately. 3360 Value *NewI = nullptr; 3361 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3362 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3363 ShrinkOperand(BO->getOperand(1))); 3364 3365 // Any wrapping introduced by shrinking this operation shouldn't be 3366 // considered undefined behavior. So, we can't unconditionally copy 3367 // arithmetic wrapping flags to NewI. 3368 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3369 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3370 NewI = 3371 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3372 ShrinkOperand(CI->getOperand(1))); 3373 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3374 NewI = B.CreateSelect(SI->getCondition(), 3375 ShrinkOperand(SI->getTrueValue()), 3376 ShrinkOperand(SI->getFalseValue())); 3377 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3378 switch (CI->getOpcode()) { 3379 default: 3380 llvm_unreachable("Unhandled cast!"); 3381 case Instruction::Trunc: 3382 NewI = ShrinkOperand(CI->getOperand(0)); 3383 break; 3384 case Instruction::SExt: 3385 NewI = B.CreateSExtOrTrunc( 3386 CI->getOperand(0), 3387 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3388 break; 3389 case Instruction::ZExt: 3390 NewI = B.CreateZExtOrTrunc( 3391 CI->getOperand(0), 3392 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3393 break; 3394 } 3395 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3396 auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); 3397 auto *O0 = B.CreateZExtOrTrunc( 3398 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3399 auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); 3400 auto *O1 = B.CreateZExtOrTrunc( 3401 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3402 3403 NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); 3404 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3405 // Don't do anything with the operands, just extend the result. 
3406 continue; 3407 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3408 auto Elements = IE->getOperand(0)->getType()->getVectorNumElements(); 3409 auto *O0 = B.CreateZExtOrTrunc( 3410 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3411 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3412 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3413 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3414 auto Elements = EE->getOperand(0)->getType()->getVectorNumElements(); 3415 auto *O0 = B.CreateZExtOrTrunc( 3416 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3417 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3418 } else { 3419 // If we don't know what to do, be conservative and don't do anything. 3420 continue; 3421 } 3422 3423 // Lastly, extend the result. 3424 NewI->takeName(cast<Instruction>(I)); 3425 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3426 I->replaceAllUsesWith(Res); 3427 cast<Instruction>(I)->eraseFromParent(); 3428 Erased.insert(I); 3429 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3430 } 3431 } 3432 3433 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3434 for (const auto &KV : Cost->getMinimalBitwidths()) { 3435 // If the value wasn't vectorized, we must maintain the original scalar 3436 // type. The absence of the value from VectorLoopValueMap indicates that it 3437 // wasn't vectorized. 3438 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3439 continue; 3440 for (unsigned Part = 0; Part < UF; ++Part) { 3441 Value *I = getOrCreateVectorValue(KV.first, Part); 3442 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3443 if (Inst && Inst->use_empty()) { 3444 Value *NewI = Inst->getOperand(0); 3445 Inst->eraseFromParent(); 3446 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3447 } 3448 } 3449 } 3450 } 3451 3452 void InnerLoopVectorizer::fixVectorizedLoop() { 3453 // Insert truncates and extends for any truncated instructions as hints to 3454 // InstCombine. 3455 if (VF > 1) 3456 truncateToMinimalBitwidths(); 3457 3458 // Fix widened non-induction PHIs by setting up the PHI operands. 3459 if (OrigPHIsToFix.size()) { 3460 assert(EnableVPlanNativePath && 3461 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3462 fixNonInductionPHIs(); 3463 } 3464 3465 // At this point every instruction in the original loop is widened to a 3466 // vector form. Now we need to fix the recurrences in the loop. These PHI 3467 // nodes are currently empty because we did not want to introduce cycles. 3468 // This is the second stage of vectorizing recurrences. 3469 fixCrossIterationPHIs(); 3470 3471 // Forget the original basic block. 3472 PSE.getSE()->forgetLoop(OrigLoop); 3473 3474 // Fix-up external users of the induction variables. 3475 for (auto &Entry : *Legal->getInductionVars()) 3476 fixupIVUsers(Entry.first, Entry.second, 3477 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3478 IVEndValues[Entry.first], LoopMiddleBlock); 3479 3480 fixLCSSAPHIs(); 3481 for (Instruction *PI : PredicatedInstructions) 3482 sinkScalarOperands(&*PI); 3483 3484 // Remove redundant induction instructions. 3485 cse(LoopVectorBody); 3486 3487 // Set/update profile weights for the vector and remainder loops as original 3488 // loop iterations are now distributed among them. Note that original loop 3489 // represented by LoopScalarBody becomes remainder loop after vectorization. 
3490 // 3491 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3492 // end up getting slightly roughened result but that should be OK since 3493 // profile is not inherently precise anyway. Note also possible bypass of 3494 // vector code caused by legality checks is ignored, assigning all the weight 3495 // to the vector loop, optimistically. 3496 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), 3497 LI->getLoopFor(LoopVectorBody), 3498 LI->getLoopFor(LoopScalarBody), VF * UF); 3499 } 3500 3501 void InnerLoopVectorizer::fixCrossIterationPHIs() { 3502 // In order to support recurrences we need to be able to vectorize Phi nodes. 3503 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3504 // stage #2: We now need to fix the recurrences by adding incoming edges to 3505 // the currently empty PHI nodes. At this point every instruction in the 3506 // original loop is widened to a vector form so we can use them to construct 3507 // the incoming edges. 3508 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 3509 // Handle first-order recurrences and reductions that need to be fixed. 3510 if (Legal->isFirstOrderRecurrence(&Phi)) 3511 fixFirstOrderRecurrence(&Phi); 3512 else if (Legal->isReductionVariable(&Phi)) 3513 fixReduction(&Phi); 3514 } 3515 } 3516 3517 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3518 // This is the second phase of vectorizing first-order recurrences. An 3519 // overview of the transformation is described below. Suppose we have the 3520 // following loop. 3521 // 3522 // for (int i = 0; i < n; ++i) 3523 // b[i] = a[i] - a[i - 1]; 3524 // 3525 // There is a first-order recurrence on "a". For this loop, the shorthand 3526 // scalar IR looks like: 3527 // 3528 // scalar.ph: 3529 // s_init = a[-1] 3530 // br scalar.body 3531 // 3532 // scalar.body: 3533 // i = phi [0, scalar.ph], [i+1, scalar.body] 3534 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3535 // s2 = a[i] 3536 // b[i] = s2 - s1 3537 // br cond, scalar.body, ... 3538 // 3539 // In this example, s1 is a recurrence because it's value depends on the 3540 // previous iteration. In the first phase of vectorization, we created a 3541 // temporary value for s1. We now complete the vectorization and produce the 3542 // shorthand vector IR shown below (for VF = 4, UF = 1). 3543 // 3544 // vector.ph: 3545 // v_init = vector(..., ..., ..., a[-1]) 3546 // br vector.body 3547 // 3548 // vector.body 3549 // i = phi [0, vector.ph], [i+4, vector.body] 3550 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3551 // v2 = a[i, i+1, i+2, i+3]; 3552 // v3 = vector(v1(3), v2(0, 1, 2)) 3553 // b[i, i+1, i+2, i+3] = v2 - v3 3554 // br cond, vector.body, middle.block 3555 // 3556 // middle.block: 3557 // x = v2(3) 3558 // br scalar.ph 3559 // 3560 // scalar.ph: 3561 // s_init = phi [x, middle.block], [a[-1], otherwise] 3562 // br scalar.body 3563 // 3564 // After execution completes the vector loop, we extract the next value of 3565 // the recurrence (x) to use as the initial value in the scalar loop. 3566 3567 // Get the original loop preheader and single loop latch. 3568 auto *Preheader = OrigLoop->getLoopPreheader(); 3569 auto *Latch = OrigLoop->getLoopLatch(); 3570 3571 // Get the initial and previous values of the scalar recurrence. 3572 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 3573 auto *Previous = Phi->getIncomingValueForBlock(Latch); 3574 3575 // Create a vector from the initial value. 
3576 auto *VectorInit = ScalarInit; 3577 if (VF > 1) { 3578 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3579 VectorInit = Builder.CreateInsertElement( 3580 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3581 Builder.getInt32(VF - 1), "vector.recur.init"); 3582 } 3583 3584 // We constructed a temporary phi node in the first phase of vectorization. 3585 // This phi node will eventually be deleted. 3586 Builder.SetInsertPoint( 3587 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3588 3589 // Create a phi node for the new recurrence. The current value will either be 3590 // the initial value inserted into a vector or loop-varying vector value. 3591 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3592 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3593 3594 // Get the vectorized previous value of the last part UF - 1. It appears last 3595 // among all unrolled iterations, due to the order of their construction. 3596 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3597 3598 // Find and set the insertion point after the previous value if it is an 3599 // instruction. 3600 BasicBlock::iterator InsertPt; 3601 // Note that the previous value may have been constant-folded so it is not 3602 // guaranteed to be an instruction in the vector loop. 3603 // FIXME: Loop invariant values do not form recurrences. We should deal with 3604 // them earlier. 3605 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3606 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3607 else { 3608 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3609 if (isa<PHINode>(PreviousLastPart)) 3610 // If the previous value is a phi node, we should insert after all the phi 3611 // nodes in the block containing the PHI to avoid breaking basic block 3612 // verification. Note that the basic block may be different to 3613 // LoopVectorBody, in case we predicate the loop. 3614 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3615 else 3616 InsertPt = ++PreviousInst->getIterator(); 3617 } 3618 Builder.SetInsertPoint(&*InsertPt); 3619 3620 // We will construct a vector for the recurrence by combining the values for 3621 // the current and previous iterations. This is the required shuffle mask. 3622 SmallVector<Constant *, 8> ShuffleMask(VF); 3623 ShuffleMask[0] = Builder.getInt32(VF - 1); 3624 for (unsigned I = 1; I < VF; ++I) 3625 ShuffleMask[I] = Builder.getInt32(I + VF - 1); 3626 3627 // The vector from which to take the initial value for the current iteration 3628 // (actual or unrolled). Initially, this is the vector phi node. 3629 Value *Incoming = VecPhi; 3630 3631 // Shuffle the current and previous vector and update the vector parts. 3632 for (unsigned Part = 0; Part < UF; ++Part) { 3633 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3634 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3635 auto *Shuffle = 3636 VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, 3637 ConstantVector::get(ShuffleMask)) 3638 : Incoming; 3639 PhiPart->replaceAllUsesWith(Shuffle); 3640 cast<Instruction>(PhiPart)->eraseFromParent(); 3641 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3642 Incoming = PreviousPart; 3643 } 3644 3645 // Fix the latch value of the new recurrence in the vector loop. 3646 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3647 3648 // Extract the last vector element in the middle block. 
This will be the 3649 // initial value for the recurrence when jumping to the scalar loop. 3650 auto *ExtractForScalar = Incoming; 3651 if (VF > 1) { 3652 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3653 ExtractForScalar = Builder.CreateExtractElement( 3654 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); 3655 } 3656 // Extract the second last element in the middle block if the 3657 // Phi is used outside the loop. We need to extract the phi itself 3658 // and not the last element (the phi update in the current iteration). This 3659 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3660 // when the scalar loop is not run at all. 3661 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3662 if (VF > 1) 3663 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3664 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); 3665 // When loop is unrolled without vectorizing, initialize 3666 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 3667 // `Incoming`. This is analogous to the vectorized case above: extracting the 3668 // second last element when VF > 1. 3669 else if (UF > 1) 3670 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 3671 3672 // Fix the initial value of the original recurrence in the scalar loop. 3673 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3674 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3675 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3676 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3677 Start->addIncoming(Incoming, BB); 3678 } 3679 3680 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3681 Phi->setName("scalar.recur"); 3682 3683 // Finally, fix users of the recurrence outside the loop. The users will need 3684 // either the last value of the scalar recurrence or the last value of the 3685 // vector recurrence we extracted in the middle block. Since the loop is in 3686 // LCSSA form, we just need to find all the phi nodes for the original scalar 3687 // recurrence in the exit block, and then add an edge for the middle block. 3688 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3689 if (LCSSAPhi.getIncomingValue(0) == Phi) { 3690 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 3691 } 3692 } 3693 } 3694 3695 void InnerLoopVectorizer::fixReduction(PHINode *Phi) { 3696 Constant *Zero = Builder.getInt32(0); 3697 3698 // Get it's reduction variable descriptor. 3699 assert(Legal->isReductionVariable(Phi) && 3700 "Unable to find the reduction variable"); 3701 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi]; 3702 3703 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3704 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3705 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3706 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = 3707 RdxDesc.getMinMaxRecurrenceKind(); 3708 setDebugLocFromInst(Builder, ReductionStartValue); 3709 3710 // We need to generate a reduction vector from the incoming scalar. 3711 // To do so, we need to generate the 'identity' vector and override 3712 // one of the elements with the incoming scalar reduction. We need 3713 // to do it in the vector-loop preheader. 3714 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3715 3716 // This is the vector-clone of the value that leaves the loop. 
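  // For illustration: for an integer add reduction with start value %s and
  // VF = 4, Identity is <0, 0, 0, 0> and VectorStart is <%s, 0, 0, 0>, i.e.
  // the start value only contributes to lane 0 of the first unroll part.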
3717 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); 3718 3719 // Find the reduction identity variable. Zero for addition, or, xor, 3720 // one for multiplication, -1 for And. 3721 Value *Identity; 3722 Value *VectorStart; 3723 if (RK == RecurrenceDescriptor::RK_IntegerMinMax || 3724 RK == RecurrenceDescriptor::RK_FloatMinMax) { 3725 // MinMax reduction have the start value as their identify. 3726 if (VF == 1) { 3727 VectorStart = Identity = ReductionStartValue; 3728 } else { 3729 VectorStart = Identity = 3730 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); 3731 } 3732 } else { 3733 // Handle other reduction kinds: 3734 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 3735 RK, VecTy->getScalarType()); 3736 if (VF == 1) { 3737 Identity = Iden; 3738 // This vector is the Identity vector where the first element is the 3739 // incoming scalar reduction. 3740 VectorStart = ReductionStartValue; 3741 } else { 3742 Identity = ConstantVector::getSplat(VF, Iden); 3743 3744 // This vector is the Identity vector where the first element is the 3745 // incoming scalar reduction. 3746 VectorStart = 3747 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); 3748 } 3749 } 3750 3751 // Wrap flags are in general invalid after vectorization, clear them. 3752 clearReductionWrapFlags(RdxDesc); 3753 3754 // Fix the vector-loop phi. 3755 3756 // Reductions do not have to start at zero. They can start with 3757 // any loop invariant values. 3758 BasicBlock *Latch = OrigLoop->getLoopLatch(); 3759 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 3760 3761 for (unsigned Part = 0; Part < UF; ++Part) { 3762 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 3763 Value *Val = getOrCreateVectorValue(LoopVal, Part); 3764 // Make sure to add the reduction start value only to the 3765 // first unroll part. 3766 Value *StartVal = (Part == 0) ? VectorStart : Identity; 3767 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 3768 cast<PHINode>(VecRdxPhi) 3769 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3770 } 3771 3772 // Before each round, move the insertion point right between 3773 // the PHIs and the values we are going to write. 3774 // This allows us to write both PHINodes and the extractelement 3775 // instructions. 3776 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3777 3778 setDebugLocFromInst(Builder, LoopExitInst); 3779 3780 // If tail is folded by masking, the vector value to leave the loop should be 3781 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3782 // instead of the former. 3783 if (Cost->foldTailByMasking()) { 3784 for (unsigned Part = 0; Part < UF; ++Part) { 3785 Value *VecLoopExitInst = 3786 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3787 Value *Sel = nullptr; 3788 for (User *U : VecLoopExitInst->users()) { 3789 if (isa<SelectInst>(U)) { 3790 assert(!Sel && "Reduction exit feeding two selects"); 3791 Sel = U; 3792 } else 3793 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3794 } 3795 assert(Sel && "Reduction exit feeds no select"); 3796 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 3797 } 3798 } 3799 3800 // If the vector reduction can be performed in a smaller type, we truncate 3801 // then extend the loop exit value to enable InstCombine to evaluate the 3802 // entire expression in the smaller type. 
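  // For example, if an i32 add reduction is known to need only 8 bits, the
  // loop exit value is truncated to <VF x i8> here and the final reduced
  // scalar is sign- or zero-extended back to i32 after the horizontal
  // reduction below.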
3803 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { 3804 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3805 Builder.SetInsertPoint( 3806 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 3807 VectorParts RdxParts(UF); 3808 for (unsigned Part = 0; Part < UF; ++Part) { 3809 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3810 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3811 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3812 : Builder.CreateZExt(Trunc, VecTy); 3813 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 3814 UI != RdxParts[Part]->user_end();) 3815 if (*UI != Trunc) { 3816 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 3817 RdxParts[Part] = Extnd; 3818 } else { 3819 ++UI; 3820 } 3821 } 3822 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3823 for (unsigned Part = 0; Part < UF; ++Part) { 3824 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3825 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 3826 } 3827 } 3828 3829 // Reduce all of the unrolled parts into a single vector. 3830 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 3831 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3832 3833 // The middle block terminator has already been assigned a DebugLoc here (the 3834 // OrigLoop's single latch terminator). We want the whole middle block to 3835 // appear to execute on this line because: (a) it is all compiler generated, 3836 // (b) these instructions are always executed after evaluating the latch 3837 // conditional branch, and (c) other passes may add new predecessors which 3838 // terminate on this line. This is the easiest way to ensure we don't 3839 // accidentally cause an extra step back into the loop while debugging. 3840 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 3841 for (unsigned Part = 1; Part < UF; ++Part) { 3842 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3843 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3844 // Floating point operations had to be 'fast' to enable the reduction. 3845 ReducedPartRdx = addFastMathFlag( 3846 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 3847 ReducedPartRdx, "bin.rdx"), 3848 RdxDesc.getFastMathFlags()); 3849 else 3850 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 3851 RdxPart); 3852 } 3853 3854 if (VF > 1) { 3855 bool NoNaN = Legal->hasFunNoNaNAttr(); 3856 ReducedPartRdx = 3857 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 3858 // If the reduction can be performed in a smaller type, we need to extend 3859 // the reduction to the wider type before we branch to the original loop. 3860 if (Phi->getType() != RdxDesc.getRecurrenceType()) 3861 ReducedPartRdx = 3862 RdxDesc.isSigned() 3863 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 3864 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 3865 } 3866 3867 // Create a phi node that merges control-flow from the backedge-taken check 3868 // block and the middle block. 
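  // In shorthand IR, the merge phi created below looks roughly like:
  //   scalar.ph:
  //     %bc.merge.rdx = phi [ %start, <bypass block> ], ...,
  //                         [ %rdx, %middle.block ]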
3869 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 3870 LoopScalarPreHeader->getTerminator()); 3871 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3872 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3873 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3874 3875 // Now, we need to fix the users of the reduction variable 3876 // inside and outside of the scalar remainder loop. 3877 // We know that the loop is in LCSSA form. We need to update the 3878 // PHI nodes in the exit blocks. 3879 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3880 // All PHINodes need to have a single entry edge, or two if 3881 // we already fixed them. 3882 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 3883 3884 // We found a reduction value exit-PHI. Update it with the 3885 // incoming bypass edge. 3886 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 3887 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 3888 } // end of the LCSSA phi scan. 3889 3890 // Fix the scalar loop reduction variable with the incoming reduction sum 3891 // from the vector body and from the backedge value. 3892 int IncomingEdgeBlockIdx = 3893 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 3894 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 3895 // Pick the other block. 3896 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 3897 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3898 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3899 } 3900 3901 void InnerLoopVectorizer::clearReductionWrapFlags( 3902 RecurrenceDescriptor &RdxDesc) { 3903 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3904 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 3905 RK != RecurrenceDescriptor::RK_IntegerMult) 3906 return; 3907 3908 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 3909 assert(LoopExitInstr && "null loop exit instruction"); 3910 SmallVector<Instruction *, 8> Worklist; 3911 SmallPtrSet<Instruction *, 8> Visited; 3912 Worklist.push_back(LoopExitInstr); 3913 Visited.insert(LoopExitInstr); 3914 3915 while (!Worklist.empty()) { 3916 Instruction *Cur = Worklist.pop_back_val(); 3917 if (isa<OverflowingBinaryOperator>(Cur)) 3918 for (unsigned Part = 0; Part < UF; ++Part) { 3919 Value *V = getOrCreateVectorValue(Cur, Part); 3920 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 3921 } 3922 3923 for (User *U : Cur->users()) { 3924 Instruction *UI = cast<Instruction>(U); 3925 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 3926 Visited.insert(UI).second) 3927 Worklist.push_back(UI); 3928 } 3929 } 3930 } 3931 3932 void InnerLoopVectorizer::fixLCSSAPHIs() { 3933 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3934 if (LCSSAPhi.getNumIncomingValues() == 1) { 3935 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 3936 // Non-instruction incoming values will have only one value. 3937 unsigned LastLane = 0; 3938 if (isa<Instruction>(IncomingValue)) 3939 LastLane = Cost->isUniformAfterVectorization( 3940 cast<Instruction>(IncomingValue), VF) 3941 ? 0 3942 : VF - 1; 3943 // Can be a loop invariant incoming value or the last scalar value to be 3944 // extracted from the vectorized loop. 
3945 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3946 Value *lastIncomingValue = 3947 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 3948 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 3949 } 3950 } 3951 } 3952 3953 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 3954 // The basic block and loop containing the predicated instruction. 3955 auto *PredBB = PredInst->getParent(); 3956 auto *VectorLoop = LI->getLoopFor(PredBB); 3957 3958 // Initialize a worklist with the operands of the predicated instruction. 3959 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 3960 3961 // Holds instructions that we need to analyze again. An instruction may be 3962 // reanalyzed if we don't yet know if we can sink it or not. 3963 SmallVector<Instruction *, 8> InstsToReanalyze; 3964 3965 // Returns true if a given use occurs in the predicated block. Phi nodes use 3966 // their operands in their corresponding predecessor blocks. 3967 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 3968 auto *I = cast<Instruction>(U.getUser()); 3969 BasicBlock *BB = I->getParent(); 3970 if (auto *Phi = dyn_cast<PHINode>(I)) 3971 BB = Phi->getIncomingBlock( 3972 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 3973 return BB == PredBB; 3974 }; 3975 3976 // Iteratively sink the scalarized operands of the predicated instruction 3977 // into the block we created for it. When an instruction is sunk, it's 3978 // operands are then added to the worklist. The algorithm ends after one pass 3979 // through the worklist doesn't sink a single instruction. 3980 bool Changed; 3981 do { 3982 // Add the instructions that need to be reanalyzed to the worklist, and 3983 // reset the changed indicator. 3984 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 3985 InstsToReanalyze.clear(); 3986 Changed = false; 3987 3988 while (!Worklist.empty()) { 3989 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 3990 3991 // We can't sink an instruction if it is a phi node, is already in the 3992 // predicated block, is not in the loop, or may have side effects. 3993 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 3994 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 3995 continue; 3996 3997 // It's legal to sink the instruction if all its uses occur in the 3998 // predicated block. Otherwise, there's nothing to do yet, and we may 3999 // need to reanalyze the instruction. 4000 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4001 InstsToReanalyze.push_back(I); 4002 continue; 4003 } 4004 4005 // Move the instruction to the beginning of the predicated block, and add 4006 // it's operands to the worklist. 4007 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4008 Worklist.insert(I->op_begin(), I->op_end()); 4009 4010 // The sinking may have enabled other instructions to be sunk, so we will 4011 // need to iterate. 
4012 Changed = true; 4013 } 4014 } while (Changed); 4015 } 4016 4017 void InnerLoopVectorizer::fixNonInductionPHIs() { 4018 for (PHINode *OrigPhi : OrigPHIsToFix) { 4019 PHINode *NewPhi = 4020 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4021 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4022 4023 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4024 predecessors(OrigPhi->getParent())); 4025 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4026 predecessors(NewPhi->getParent())); 4027 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4028 "Scalar and Vector BB should have the same number of predecessors"); 4029 4030 // The insertion point in Builder may be invalidated by the time we get 4031 // here. Force the Builder insertion point to something valid so that we do 4032 // not run into issues during insertion point restore in 4033 // getOrCreateVectorValue calls below. 4034 Builder.SetInsertPoint(NewPhi); 4035 4036 // The predecessor order is preserved and we can rely on mapping between 4037 // scalar and vector block predecessors. 4038 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4039 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4040 4041 // When looking up the new scalar/vector values to fix up, use incoming 4042 // values from original phi. 4043 Value *ScIncV = 4044 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4045 4046 // Scalar incoming value may need a broadcast 4047 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4048 NewPhi->addIncoming(NewIncV, NewPredBB); 4049 } 4050 } 4051 } 4052 4053 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, 4054 unsigned VF, bool IsPtrLoopInvariant, 4055 SmallBitVector &IsIndexLoopInvariant) { 4056 // Construct a vector GEP by widening the operands of the scalar GEP as 4057 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4058 // results in a vector of pointers when at least one operand of the GEP 4059 // is vector-typed. Thus, to keep the representation compact, we only use 4060 // vector-typed operands for loop-varying values. 4061 4062 if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4063 // If we are vectorizing, but the GEP has only loop-invariant operands, 4064 // the GEP we build (by only using vector-typed operands for 4065 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4066 // produce a vector of pointers, we need to either arbitrarily pick an 4067 // operand to broadcast, or broadcast a clone of the original GEP. 4068 // Here, we broadcast a clone of the original. 4069 // 4070 // TODO: If at some point we decide to scalarize instructions having 4071 // loop-invariant operands, this special case will no longer be 4072 // required. We would add the scalarization decision to 4073 // collectLoopScalars() and teach getVectorValue() to broadcast 4074 // the lane-zero scalar value. 4075 auto *Clone = Builder.Insert(GEP->clone()); 4076 for (unsigned Part = 0; Part < UF; ++Part) { 4077 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4078 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4079 addMetadata(EntryPart, GEP); 4080 } 4081 } else { 4082 // If the GEP has at least one loop-varying operand, we are sure to 4083 // produce a vector of pointers. But if we are only unrolling, we want 4084 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4085 // produce with the code below will be scalar (if VF == 1) or vector 4086 // (otherwise). 
Note that for the unroll-only case, we still maintain 4087 // values in the vector mapping with initVector, as we do for other 4088 // instructions. 4089 for (unsigned Part = 0; Part < UF; ++Part) { 4090 // The pointer operand of the new GEP. If it's loop-invariant, we 4091 // won't broadcast it. 4092 auto *Ptr = IsPtrLoopInvariant 4093 ? GEP->getPointerOperand() 4094 : getOrCreateVectorValue(GEP->getPointerOperand(), Part); 4095 4096 // Collect all the indices for the new GEP. If any index is 4097 // loop-invariant, we won't broadcast it. 4098 SmallVector<Value *, 4> Indices; 4099 for (auto Index : enumerate(GEP->indices())) { 4100 Value *User = Index.value().get(); 4101 if (IsIndexLoopInvariant[Index.index()]) 4102 Indices.push_back(User); 4103 else 4104 Indices.push_back(getOrCreateVectorValue(User, Part)); 4105 } 4106 4107 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4108 // but it should be a vector, otherwise. 4109 auto *NewGEP = 4110 GEP->isInBounds() 4111 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4112 Indices) 4113 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4114 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4115 "NewGEP is not a pointer vector"); 4116 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4117 addMetadata(NewGEP, GEP); 4118 } 4119 } 4120 } 4121 4122 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4123 unsigned VF) { 4124 PHINode *P = cast<PHINode>(PN); 4125 if (EnableVPlanNativePath) { 4126 // Currently we enter here in the VPlan-native path for non-induction 4127 // PHIs where all control flow is uniform. We simply widen these PHIs. 4128 // Create a vector phi with no operands - the vector phi operands will be 4129 // set at the end of vector code generation. 4130 Type *VecTy = 4131 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4132 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4133 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4134 OrigPHIsToFix.push_back(P); 4135 4136 return; 4137 } 4138 4139 assert(PN->getParent() == OrigLoop->getHeader() && 4140 "Non-header phis should have been handled elsewhere"); 4141 4142 // In order to support recurrences we need to be able to vectorize Phi nodes. 4143 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4144 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4145 // this value when we vectorize all of the instructions that use the PHI. 4146 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4147 for (unsigned Part = 0; Part < UF; ++Part) { 4148 // This is phase one of vectorizing PHIs. 4149 Type *VecTy = 4150 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4151 Value *EntryPart = PHINode::Create( 4152 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4153 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4154 } 4155 return; 4156 } 4157 4158 setDebugLocFromInst(Builder, P); 4159 4160 // This PHINode must be an induction variable. 4161 // Make sure that we know about it. 4162 assert(Legal->getInductionVars()->count(P) && "Not an induction variable"); 4163 4164 InductionDescriptor II = Legal->getInductionVars()->lookup(P); 4165 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4166 4167 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4168 // which can be found from the original scalar operations. 
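  // Editorial worked example (illustrative values, not part of the original
  // source): in the IK_PtrInduction case below, the scalar index for each
  // generated value is Lane + Part * VF added to the normalized pointer
  // induction. Assuming VF = 4 and UF = 2, part 0 produces offsets 0..3 and
  // part 1 produces offsets 4..7, each passed through emitTransformedIndex()
  // to form a "next.gep" scalar. If the induction is uniform after
  // vectorization, only lane 0 of each part is generated.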
4169 switch (II.getKind()) { 4170 case InductionDescriptor::IK_NoInduction: 4171 llvm_unreachable("Unknown induction"); 4172 case InductionDescriptor::IK_IntInduction: 4173 case InductionDescriptor::IK_FpInduction: 4174 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4175 case InductionDescriptor::IK_PtrInduction: { 4176 // Handle the pointer induction variable case. 4177 assert(P->getType()->isPointerTy() && "Unexpected type."); 4178 // This is the normalized GEP that starts counting at zero. 4179 Value *PtrInd = Induction; 4180 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); 4181 // Determine the number of scalars we need to generate for each unroll 4182 // iteration. If the instruction is uniform, we only need to generate the 4183 // first lane. Otherwise, we generate all VF values. 4184 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; 4185 // These are the scalar results. Notice that we don't generate vector GEPs 4186 // because scalar GEPs result in better code. 4187 for (unsigned Part = 0; Part < UF; ++Part) { 4188 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4189 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); 4190 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4191 Value *SclrGep = 4192 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4193 SclrGep->setName("next.gep"); 4194 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4195 } 4196 } 4197 return; 4198 } 4199 } 4200 } 4201 4202 /// A helper function for checking whether an integer division-related 4203 /// instruction may divide by zero (in which case it must be predicated if 4204 /// executed conditionally in the scalar code). 4205 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4206 /// Non-zero divisors that are non compile-time constants will not be 4207 /// converted into multiplication, so we will still end up scalarizing 4208 /// the division, but can do so w/o predication. 4209 static bool mayDivideByZero(Instruction &I) { 4210 assert((I.getOpcode() == Instruction::UDiv || 4211 I.getOpcode() == Instruction::SDiv || 4212 I.getOpcode() == Instruction::URem || 4213 I.getOpcode() == Instruction::SRem) && 4214 "Unexpected instruction"); 4215 Value *Divisor = I.getOperand(1); 4216 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4217 return !CInt || CInt->isZero(); 4218 } 4219 4220 void InnerLoopVectorizer::widenInstruction(Instruction &I) { 4221 switch (I.getOpcode()) { 4222 case Instruction::Br: 4223 case Instruction::PHI: 4224 case Instruction::GetElementPtr: 4225 llvm_unreachable("This instruction is handled by a different recipe."); 4226 case Instruction::UDiv: 4227 case Instruction::SDiv: 4228 case Instruction::SRem: 4229 case Instruction::URem: 4230 case Instruction::Add: 4231 case Instruction::FAdd: 4232 case Instruction::Sub: 4233 case Instruction::FSub: 4234 case Instruction::FNeg: 4235 case Instruction::Mul: 4236 case Instruction::FMul: 4237 case Instruction::FDiv: 4238 case Instruction::FRem: 4239 case Instruction::Shl: 4240 case Instruction::LShr: 4241 case Instruction::AShr: 4242 case Instruction::And: 4243 case Instruction::Or: 4244 case Instruction::Xor: { 4245 // Just widen unops and binops. 
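    // For illustration (editorial example, assuming VF = 4 and UF = 2, not
    // taken from the original source): a scalar "%r = fadd float %a, %b" is
    // widened into two <4 x float> fadd instructions, one per unroll part,
    // built from the vector values of %a and %b for that part; IR flags such
    // as fast-math flags are copied from the original via copyIRFlags() below.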
4246 setDebugLocFromInst(Builder, &I); 4247 4248 for (unsigned Part = 0; Part < UF; ++Part) { 4249 SmallVector<Value *, 2> Ops; 4250 for (Value *Op : I.operands()) 4251 Ops.push_back(getOrCreateVectorValue(Op, Part)); 4252 4253 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4254 4255 if (auto *VecOp = dyn_cast<Instruction>(V)) 4256 VecOp->copyIRFlags(&I); 4257 4258 // Use this vector value for all users of the original instruction. 4259 VectorLoopValueMap.setVectorValue(&I, Part, V); 4260 addMetadata(V, &I); 4261 } 4262 4263 break; 4264 } 4265 case Instruction::Select: { 4266 // Widen selects. 4267 // If the selector is loop invariant we can create a select 4268 // instruction with a scalar condition. Otherwise, use vector-select. 4269 auto *SE = PSE.getSE(); 4270 bool InvariantCond = 4271 SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); 4272 setDebugLocFromInst(Builder, &I); 4273 4274 // The condition can be loop invariant but still defined inside the 4275 // loop. This means that we can't just use the original 'cond' value. 4276 // We have to take the 'vectorized' value and pick the first lane. 4277 // Instcombine will make this a no-op. 4278 4279 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); 4280 4281 for (unsigned Part = 0; Part < UF; ++Part) { 4282 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); 4283 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); 4284 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); 4285 Value *Sel = 4286 Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1); 4287 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4288 addMetadata(Sel, &I); 4289 } 4290 4291 break; 4292 } 4293 4294 case Instruction::ICmp: 4295 case Instruction::FCmp: { 4296 // Widen compares. Generate vector compares. 4297 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4298 auto *Cmp = cast<CmpInst>(&I); 4299 setDebugLocFromInst(Builder, Cmp); 4300 for (unsigned Part = 0; Part < UF; ++Part) { 4301 Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); 4302 Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part); 4303 Value *C = nullptr; 4304 if (FCmp) { 4305 // Propagate fast math flags. 4306 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4307 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4308 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4309 } else { 4310 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4311 } 4312 VectorLoopValueMap.setVectorValue(&I, Part, C); 4313 addMetadata(C, &I); 4314 } 4315 4316 break; 4317 } 4318 4319 case Instruction::ZExt: 4320 case Instruction::SExt: 4321 case Instruction::FPToUI: 4322 case Instruction::FPToSI: 4323 case Instruction::FPExt: 4324 case Instruction::PtrToInt: 4325 case Instruction::IntToPtr: 4326 case Instruction::SIToFP: 4327 case Instruction::UIToFP: 4328 case Instruction::Trunc: 4329 case Instruction::FPTrunc: 4330 case Instruction::BitCast: { 4331 auto *CI = cast<CastInst>(&I); 4332 setDebugLocFromInst(Builder, CI); 4333 4334 /// Vectorize casts. 4335 Type *DestTy = 4336 (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); 4337 4338 for (unsigned Part = 0; Part < UF; ++Part) { 4339 Value *A = getOrCreateVectorValue(CI->getOperand(0), Part); 4340 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4341 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4342 addMetadata(Cast, &I); 4343 } 4344 break; 4345 } 4346 4347 case Instruction::Call: { 4348 // Ignore dbg intrinsics. 
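    // Editorial example (hypothetical call, not from the original source):
    // for a call to llvm.sqrt.f32 with VF = 4, the code below compares the
    // cost of the vector intrinsic (llvm.sqrt.v4f32) against a vectorized
    // library routine from the VFDatabase and emits whichever the cost model
    // reported as cheaper.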
4349 if (isa<DbgInfoIntrinsic>(I)) 4350 break; 4351 setDebugLocFromInst(Builder, &I); 4352 4353 Module *M = I.getParent()->getParent()->getParent(); 4354 auto *CI = cast<CallInst>(&I); 4355 4356 SmallVector<Type *, 4> Tys; 4357 for (Value *ArgOperand : CI->arg_operands()) 4358 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4359 4360 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4361 4362 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4363 // version of the instruction. 4364 // Is it beneficial to perform intrinsic call compared to lib call? 4365 bool NeedToScalarize; 4366 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4367 bool UseVectorIntrinsic = 4368 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4369 assert((UseVectorIntrinsic || !NeedToScalarize) && 4370 "Instruction should be scalarized elsewhere."); 4371 4372 for (unsigned Part = 0; Part < UF; ++Part) { 4373 SmallVector<Value *, 4> Args; 4374 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 4375 Value *Arg = CI->getArgOperand(i); 4376 // Some intrinsics have a scalar argument - don't replace it with a 4377 // vector. 4378 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) 4379 Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part); 4380 Args.push_back(Arg); 4381 } 4382 4383 Function *VectorF; 4384 if (UseVectorIntrinsic) { 4385 // Use vector version of the intrinsic. 4386 Type *TysForDecl[] = {CI->getType()}; 4387 if (VF > 1) 4388 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4389 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4390 } else { 4391 // Use vector version of the function call. 4392 const VFShape Shape = 4393 VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/); 4394 #ifndef NDEBUG 4395 const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI); 4396 assert(std::find_if(Infos.begin(), Infos.end(), 4397 [&Shape](const VFInfo &Info) { 4398 return Info.Shape == Shape; 4399 }) != Infos.end() && 4400 "Vector function shape is missing from the database."); 4401 #endif 4402 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4403 } 4404 assert(VectorF && "Can't create vector function."); 4405 4406 SmallVector<OperandBundleDef, 1> OpBundles; 4407 CI->getOperandBundlesAsDefs(OpBundles); 4408 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4409 4410 if (isa<FPMathOperator>(V)) 4411 V->copyFastMathFlags(CI); 4412 4413 VectorLoopValueMap.setVectorValue(&I, Part, V); 4414 addMetadata(V, &I); 4415 } 4416 4417 break; 4418 } 4419 4420 default: 4421 // This instruction is not vectorized by simple widening. 4422 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4423 llvm_unreachable("Unhandled instruction!"); 4424 } // end of switch. 4425 } 4426 4427 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4428 // We should not collect Scalars more than once per VF. Right now, this 4429 // function is called from collectUniformsAndScalars(), which already does 4430 // this check. Collecting Scalars for VF=1 does not make any sense. 4431 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && 4432 "This function should not be visited twice for the same VF"); 4433 4434 SmallSetVector<Instruction *, 8> Worklist; 4435 4436 // These sets are used to seed the analysis with pointers used by memory 4437 // accesses that will remain scalar. 
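  // Editorial illustration (not from the original source): a loop-varying
  // getelementptr used only as the address of loads/stores that will not
  // become gathers/scatters lands in ScalarPtrs below; if the access is a
  // gather/scatter, or the pointer also has a non-memory user (say, it feeds
  // a PHI), it lands in PossibleNonScalarPtrs instead and does not seed the
  // scalar worklist.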
4438 SmallSetVector<Instruction *, 8> ScalarPtrs; 4439 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4440 4441 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4442 // The pointer operands of loads and stores will be scalar as long as the 4443 // memory access is not a gather or scatter operation. The value operand of a 4444 // store will remain scalar if the store is scalarized. 4445 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4446 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4447 assert(WideningDecision != CM_Unknown && 4448 "Widening decision should be ready at this moment"); 4449 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4450 if (Ptr == Store->getValueOperand()) 4451 return WideningDecision == CM_Scalarize; 4452 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4453 "Ptr is neither a value nor a pointer operand"); 4454 return WideningDecision != CM_GatherScatter; 4455 }; 4456 4457 // A helper that returns true if the given value is a bitcast or 4458 // getelementptr instruction contained in the loop. 4459 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4460 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4461 isa<GetElementPtrInst>(V)) && 4462 !TheLoop->isLoopInvariant(V); 4463 }; 4464 4465 // A helper that evaluates a memory access's use of a pointer. If the use 4466 // will be a scalar use, and the pointer is only used by memory accesses, we 4467 // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4468 // PossibleNonScalarPtrs. 4469 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4470 // We only care about bitcast and getelementptr instructions contained in 4471 // the loop. 4472 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4473 return; 4474 4475 // If the pointer has already been identified as scalar (e.g., if it was 4476 // also identified as uniform), there's nothing to do. 4477 auto *I = cast<Instruction>(Ptr); 4478 if (Worklist.count(I)) 4479 return; 4480 4481 // If the use of the pointer will be a scalar use, and all users of the 4482 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4483 // place the pointer in PossibleNonScalarPtrs. 4484 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4485 return isa<LoadInst>(U) || isa<StoreInst>(U); 4486 })) 4487 ScalarPtrs.insert(I); 4488 else 4489 PossibleNonScalarPtrs.insert(I); 4490 }; 4491 4492 // We seed the scalars analysis with three classes of instructions: (1) 4493 // instructions marked uniform-after-vectorization, (2) bitcast and 4494 // getelementptr instructions used by memory accesses requiring a scalar use, 4495 // and (3) pointer induction variables and their update instructions (we 4496 // currently only scalarize these). 4497 // 4498 // (1) Add to the worklist all instructions that have been identified as 4499 // uniform-after-vectorization. 4500 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4501 4502 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4503 // memory accesses requiring a scalar use. The pointer operands of loads and 4504 // stores will be scalar as long as the memory access is not a gather or 4505 // scatter operation. The value operand of a store will remain scalar if the 4506 // store is scalarized.
4507 for (auto *BB : TheLoop->blocks()) 4508 for (auto &I : *BB) { 4509 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4510 evaluatePtrUse(Load, Load->getPointerOperand()); 4511 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4512 evaluatePtrUse(Store, Store->getPointerOperand()); 4513 evaluatePtrUse(Store, Store->getValueOperand()); 4514 } 4515 } 4516 for (auto *I : ScalarPtrs) 4517 if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) { 4518 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4519 Worklist.insert(I); 4520 } 4521 4522 // (3) Add to the worklist all pointer induction variables and their update 4523 // instructions. 4524 // 4525 // TODO: Once we are able to vectorize pointer induction variables we should 4526 // no longer insert them into the worklist here. 4527 auto *Latch = TheLoop->getLoopLatch(); 4528 for (auto &Induction : *Legal->getInductionVars()) { 4529 auto *Ind = Induction.first; 4530 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4531 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) 4532 continue; 4533 Worklist.insert(Ind); 4534 Worklist.insert(IndUpdate); 4535 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4536 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4537 << "\n"); 4538 } 4539 4540 // Insert the forced scalars. 4541 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4542 // induction variable when the PHI user is scalarized. 4543 auto ForcedScalar = ForcedScalars.find(VF); 4544 if (ForcedScalar != ForcedScalars.end()) 4545 for (auto *I : ForcedScalar->second) 4546 Worklist.insert(I); 4547 4548 // Expand the worklist by looking through any bitcasts and getelementptr 4549 // instructions we've already identified as scalar. This is similar to the 4550 // expansion step in collectLoopUniforms(); however, here we're only 4551 // expanding to include additional bitcasts and getelementptr instructions. 4552 unsigned Idx = 0; 4553 while (Idx != Worklist.size()) { 4554 Instruction *Dst = Worklist[Idx++]; 4555 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4556 continue; 4557 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4558 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4559 auto *J = cast<Instruction>(U); 4560 return !TheLoop->contains(J) || Worklist.count(J) || 4561 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4562 isScalarUse(J, Src)); 4563 })) { 4564 Worklist.insert(Src); 4565 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4566 } 4567 } 4568 4569 // An induction variable will remain scalar if all users of the induction 4570 // variable and induction variable update remain scalar. 4571 for (auto &Induction : *Legal->getInductionVars()) { 4572 auto *Ind = Induction.first; 4573 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4574 4575 // We already considered pointer induction variables, so there's no reason 4576 // to look at their users again. 4577 // 4578 // TODO: Once we are able to vectorize pointer induction variables we 4579 // should no longer skip over them here. 4580 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) 4581 continue; 4582 4583 // Determine if all users of the induction variable are scalar after 4584 // vectorization. 
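  // For example (editorial note): given "%i = phi" with update
  // "%i.next = add %i, 1", %i remains scalar only if every in-loop user of
  // %i other than %i.next is already in the worklist, and the symmetric
  // check below must also hold for %i.next.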
4585 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4586 auto *I = cast<Instruction>(U); 4587 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4588 }); 4589 if (!ScalarInd) 4590 continue; 4591 4592 // Determine if all users of the induction variable update instruction are 4593 // scalar after vectorization. 4594 auto ScalarIndUpdate = 4595 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4596 auto *I = cast<Instruction>(U); 4597 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4598 }); 4599 if (!ScalarIndUpdate) 4600 continue; 4601 4602 // The induction variable and its update instruction will remain scalar. 4603 Worklist.insert(Ind); 4604 Worklist.insert(IndUpdate); 4605 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4606 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4607 << "\n"); 4608 } 4609 4610 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4611 } 4612 4613 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { 4614 if (!blockNeedsPredication(I->getParent())) 4615 return false; 4616 switch(I->getOpcode()) { 4617 default: 4618 break; 4619 case Instruction::Load: 4620 case Instruction::Store: { 4621 if (!Legal->isMaskRequired(I)) 4622 return false; 4623 auto *Ptr = getLoadStorePointerOperand(I); 4624 auto *Ty = getMemInstValueType(I); 4625 // We have already decided how to vectorize this instruction, get that 4626 // result. 4627 if (VF > 1) { 4628 InstWidening WideningDecision = getWideningDecision(I, VF); 4629 assert(WideningDecision != CM_Unknown && 4630 "Widening decision should be ready at this moment"); 4631 return WideningDecision == CM_Scalarize; 4632 } 4633 const MaybeAlign Alignment = getLoadStoreAlignment(I); 4634 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4635 isLegalMaskedGather(Ty, Alignment)) 4636 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4637 isLegalMaskedScatter(Ty, Alignment)); 4638 } 4639 case Instruction::UDiv: 4640 case Instruction::SDiv: 4641 case Instruction::SRem: 4642 case Instruction::URem: 4643 return mayDivideByZero(*I); 4644 } 4645 return false; 4646 } 4647 4648 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, 4649 unsigned VF) { 4650 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4651 assert(getWideningDecision(I, VF) == CM_Unknown && 4652 "Decision should not be set yet."); 4653 auto *Group = getInterleavedAccessGroup(I); 4654 assert(Group && "Must have a group."); 4655 4656 // If the instruction's allocated size doesn't equal it's type size, it 4657 // requires padding and will be scalarized. 4658 auto &DL = I->getModule()->getDataLayout(); 4659 auto *ScalarTy = getMemInstValueType(I); 4660 if (hasIrregularType(ScalarTy, DL, VF)) 4661 return false; 4662 4663 // Check if masking is required. 4664 // A Group may need masking for one of two reasons: it resides in a block that 4665 // needs predication, or it was decided to use masking to deal with gaps. 
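  // Editorial illustration (not from the original source): a load group that
  // touches only members 0 and 1 of a 3-member interleave factor has a gap,
  // so it may report Group->requiresScalarEpilogue(); when the scalar
  // epilogue is not allowed (e.g. when folding the tail), that gap has to be
  // handled by masking, which is the AccessWithGapsRequiresMasking case
  // below.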
4666 bool PredicatedAccessRequiresMasking = 4667 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 4668 bool AccessWithGapsRequiresMasking = 4669 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 4670 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 4671 return true; 4672 4673 // If masked interleaving is required, we expect that the user/target had 4674 // enabled it, because otherwise it either wouldn't have been created or 4675 // it should have been invalidated by the CostModel. 4676 assert(useMaskedInterleavedAccesses(TTI) && 4677 "Masked interleave-groups for predicated accesses are not enabled."); 4678 4679 auto *Ty = getMemInstValueType(I); 4680 const MaybeAlign Alignment = getLoadStoreAlignment(I); 4681 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4682 : TTI.isLegalMaskedStore(Ty, Alignment); 4683 } 4684 4685 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, 4686 unsigned VF) { 4687 // Get and ensure we have a valid memory instruction. 4688 LoadInst *LI = dyn_cast<LoadInst>(I); 4689 StoreInst *SI = dyn_cast<StoreInst>(I); 4690 assert((LI || SI) && "Invalid memory instruction"); 4691 4692 auto *Ptr = getLoadStorePointerOperand(I); 4693 4694 // In order to be widened, the pointer should be consecutive, first of all. 4695 if (!Legal->isConsecutivePtr(Ptr)) 4696 return false; 4697 4698 // If the instruction is a store located in a predicated block, it will be 4699 // scalarized. 4700 if (isScalarWithPredication(I)) 4701 return false; 4702 4703 // If the instruction's allocated size doesn't equal it's type size, it 4704 // requires padding and will be scalarized. 4705 auto &DL = I->getModule()->getDataLayout(); 4706 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 4707 if (hasIrregularType(ScalarTy, DL, VF)) 4708 return false; 4709 4710 return true; 4711 } 4712 4713 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { 4714 // We should not collect Uniforms more than once per VF. Right now, 4715 // this function is called from collectUniformsAndScalars(), which 4716 // already does this check. Collecting Uniforms for VF=1 does not make any 4717 // sense. 4718 4719 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && 4720 "This function should not be visited twice for the same VF"); 4721 4722 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4723 // not analyze again. Uniforms.count(VF) will return 1. 4724 Uniforms[VF].clear(); 4725 4726 // We now know that the loop is vectorizable! 4727 // Collect instructions inside the loop that will remain uniform after 4728 // vectorization. 4729 4730 // Global values, params and instructions outside of current loop are out of 4731 // scope. 4732 auto isOutOfScope = [&](Value *V) -> bool { 4733 Instruction *I = dyn_cast<Instruction>(V); 4734 return (!I || !TheLoop->contains(I)); 4735 }; 4736 4737 SetVector<Instruction *> Worklist; 4738 BasicBlock *Latch = TheLoop->getLoopLatch(); 4739 4740 // Instructions that are scalar with predication must not be considered 4741 // uniform after vectorization, because that would create an erroneous 4742 // replicating region where only a single instance out of VF should be formed. 4743 // TODO: optimize such seldom cases if found important, see PR40816. 
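  // For instance (editorial illustration): a udiv whose divisor may be zero
  // inside an if-converted block is scalar-with-predication; treating it as
  // uniform would emit a single predicated copy where VF per-lane copies are
  // actually required, which is exactly what addToWorklistIfAllowed below
  // filters out.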
4744 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4745 if (isScalarWithPredication(I, VF)) { 4746 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4747 << *I << "\n"); 4748 return; 4749 } 4750 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4751 Worklist.insert(I); 4752 }; 4753 4754 // Start with the conditional branch. If the branch condition is an 4755 // instruction contained in the loop that is only used by the branch, it is 4756 // uniform. 4757 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4758 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4759 addToWorklistIfAllowed(Cmp); 4760 4761 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 4762 // are pointers that are treated like consecutive pointers during 4763 // vectorization. The pointer operands of interleaved accesses are an 4764 // example. 4765 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 4766 4767 // Holds pointer operands of instructions that are possibly non-uniform. 4768 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 4769 4770 auto isUniformDecision = [&](Instruction *I, unsigned VF) { 4771 InstWidening WideningDecision = getWideningDecision(I, VF); 4772 assert(WideningDecision != CM_Unknown && 4773 "Widening decision should be ready at this moment"); 4774 4775 return (WideningDecision == CM_Widen || 4776 WideningDecision == CM_Widen_Reverse || 4777 WideningDecision == CM_Interleave); 4778 }; 4779 // Iterate over the instructions in the loop, and collect all 4780 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 4781 // that a consecutive-like pointer operand will be scalarized, we collect it 4782 // in PossibleNonUniformPtrs instead. We use two sets here because a single 4783 // getelementptr instruction can be used by both vectorized and scalarized 4784 // memory instructions. For example, if a loop loads and stores from the same 4785 // location, but the store is conditional, the store will be scalarized, and 4786 // the getelementptr won't remain uniform. 4787 for (auto *BB : TheLoop->blocks()) 4788 for (auto &I : *BB) { 4789 // If there's no pointer operand, there's nothing to do. 4790 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 4791 if (!Ptr) 4792 continue; 4793 4794 // True if all users of Ptr are memory accesses that have Ptr as their 4795 // pointer operand. 4796 auto UsersAreMemAccesses = 4797 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 4798 return getLoadStorePointerOperand(U) == Ptr; 4799 }); 4800 4801 // Ensure the memory instruction will not be scalarized or used by 4802 // gather/scatter, making its pointer operand non-uniform. If the pointer 4803 // operand is used by any instruction other than a memory access, we 4804 // conservatively assume the pointer operand may be non-uniform. 4805 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 4806 PossibleNonUniformPtrs.insert(Ptr); 4807 4808 // If the memory instruction will be vectorized and its pointer operand 4809 // is consecutive-like, or interleaving - the pointer operand should 4810 // remain uniform. 4811 else 4812 ConsecutiveLikePtrs.insert(Ptr); 4813 } 4814 4815 // Add to the Worklist all consecutive and consecutive-like pointers that 4816 // aren't also identified as possibly non-uniform. 
4817 for (auto *V : ConsecutiveLikePtrs) 4818 if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) 4819 addToWorklistIfAllowed(V); 4820 4821 // Expand Worklist in topological order: whenever a new instruction 4822 // is added , its users should be already inside Worklist. It ensures 4823 // a uniform instruction will only be used by uniform instructions. 4824 unsigned idx = 0; 4825 while (idx != Worklist.size()) { 4826 Instruction *I = Worklist[idx++]; 4827 4828 for (auto OV : I->operand_values()) { 4829 // isOutOfScope operands cannot be uniform instructions. 4830 if (isOutOfScope(OV)) 4831 continue; 4832 // First order recurrence Phi's should typically be considered 4833 // non-uniform. 4834 auto *OP = dyn_cast<PHINode>(OV); 4835 if (OP && Legal->isFirstOrderRecurrence(OP)) 4836 continue; 4837 // If all the users of the operand are uniform, then add the 4838 // operand into the uniform worklist. 4839 auto *OI = cast<Instruction>(OV); 4840 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4841 auto *J = cast<Instruction>(U); 4842 return Worklist.count(J) || 4843 (OI == getLoadStorePointerOperand(J) && 4844 isUniformDecision(J, VF)); 4845 })) 4846 addToWorklistIfAllowed(OI); 4847 } 4848 } 4849 4850 // Returns true if Ptr is the pointer operand of a memory access instruction 4851 // I, and I is known to not require scalarization. 4852 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4853 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4854 }; 4855 4856 // For an instruction to be added into Worklist above, all its users inside 4857 // the loop should also be in Worklist. However, this condition cannot be 4858 // true for phi nodes that form a cyclic dependence. We must process phi 4859 // nodes separately. An induction variable will remain uniform if all users 4860 // of the induction variable and induction variable update remain uniform. 4861 // The code below handles both pointer and non-pointer induction variables. 4862 for (auto &Induction : *Legal->getInductionVars()) { 4863 auto *Ind = Induction.first; 4864 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4865 4866 // Determine if all users of the induction variable are uniform after 4867 // vectorization. 4868 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4869 auto *I = cast<Instruction>(U); 4870 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4871 isVectorizedMemAccessUse(I, Ind); 4872 }); 4873 if (!UniformInd) 4874 continue; 4875 4876 // Determine if all users of the induction variable update instruction are 4877 // uniform after vectorization. 4878 auto UniformIndUpdate = 4879 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4880 auto *I = cast<Instruction>(U); 4881 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4882 isVectorizedMemAccessUse(I, IndUpdate); 4883 }); 4884 if (!UniformIndUpdate) 4885 continue; 4886 4887 // The induction variable and its update instruction will remain uniform. 4888 addToWorklistIfAllowed(Ind); 4889 addToWorklistIfAllowed(IndUpdate); 4890 } 4891 4892 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4893 } 4894 4895 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4896 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4897 4898 if (Legal->getRuntimePointerChecking()->Need) { 4899 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4900 "runtime pointer checks needed. 
Enable vectorization of this " 4901 "loop with '#pragma clang loop vectorize(enable)' when " 4902 "compiling with -Os/-Oz", 4903 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4904 return true; 4905 } 4906 4907 if (!PSE.getUnionPredicate().getPredicates().empty()) { 4908 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4909 "runtime SCEV checks needed. Enable vectorization of this " 4910 "loop with '#pragma clang loop vectorize(enable)' when " 4911 "compiling with -Os/-Oz", 4912 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4913 return true; 4914 } 4915 4916 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4917 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4918 reportVectorizationFailure("Runtime stride check is required with -Os/-Oz", 4919 "runtime stride == 1 checks needed. Enable vectorization of " 4920 "this loop with '#pragma clang loop vectorize(enable)' when " 4921 "compiling with -Os/-Oz", 4922 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4923 return true; 4924 } 4925 4926 return false; 4927 } 4928 4929 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { 4930 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4931 // TODO: It may be useful to do this, since it's still likely to be dynamically 4932 // uniform if the target can skip it. 4933 reportVectorizationFailure( 4934 "Not inserting runtime ptr check for divergent target", 4935 "runtime pointer checks needed. Not enabled for divergent target", 4936 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 4937 return None; 4938 } 4939 4940 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4941 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4942 if (TC == 1) { 4943 reportVectorizationFailure("Single iteration (non) loop", 4944 "loop trip count is one, irrelevant for vectorization", 4945 "SingleIterationLoop", ORE, TheLoop); 4946 return None; 4947 } 4948 4949 switch (ScalarEpilogueStatus) { 4950 case CM_ScalarEpilogueAllowed: 4951 return computeFeasibleMaxVF(TC); 4952 case CM_ScalarEpilogueNotNeededUsePredicate: 4953 LLVM_DEBUG( 4954 dbgs() << "LV: vector predicate hint/switch found.\n" 4955 << "LV: Not allowing scalar epilogue, creating predicated " 4956 << "vector loop.\n"); 4957 break; 4958 case CM_ScalarEpilogueNotAllowedLowTripLoop: 4959 // fallthrough as a special case of OptForSize 4960 case CM_ScalarEpilogueNotAllowedOptSize: 4961 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 4962 LLVM_DEBUG( 4963 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 4964 else 4965 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4966 << "count.\n"); 4967 4968 // Bail if runtime checks are required, which are not good when optimising 4969 // for size. 4970 if (runtimeChecksRequired()) 4971 return None; 4972 break; 4973 } 4974 4975 // Now try the tail folding. 4976 4977 // Invalidate interleave groups that require an epilogue if we can't mask 4978 // the interleave-group. 4979 if (!useMaskedInterleavedAccesses(TTI)) 4980 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 4981 4982 unsigned MaxVF = computeFeasibleMaxVF(TC); 4983 if (TC > 0 && TC % MaxVF == 0) { 4984 // Accept MaxVF if we do not have a tail.
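    // For example (illustrative numbers only): a constant trip count of 64
    // with MaxVF == 8 divides evenly, so no scalar remainder is left and
    // neither a scalar epilogue nor tail folding is needed.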
4985 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 4986 return MaxVF; 4987 } 4988 4989 // If we don't know the precise trip count, or if the trip count that we 4990 // found modulo the vectorization factor is not zero, try to fold the tail 4991 // by masking. 4992 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 4993 if (Legal->prepareToFoldTailByMasking()) { 4994 FoldTailByMasking = true; 4995 return MaxVF; 4996 } 4997 4998 if (TC == 0) { 4999 reportVectorizationFailure( 5000 "Unable to calculate the loop count due to complex control flow", 5001 "unable to calculate the loop count due to complex control flow", 5002 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5003 return None; 5004 } 5005 5006 reportVectorizationFailure( 5007 "Cannot optimize for size and vectorize at the same time.", 5008 "cannot optimize for size and vectorize at the same time. " 5009 "Enable vectorization of this loop with '#pragma clang loop " 5010 "vectorize(enable)' when compiling with -Os/-Oz", 5011 "NoTailLoopWithOptForSize", ORE, TheLoop); 5012 return None; 5013 } 5014 5015 unsigned 5016 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 5017 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5018 unsigned SmallestType, WidestType; 5019 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5020 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5021 5022 // Get the maximum safe dependence distance in bits computed by LAA. 5023 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5024 // the memory accesses that is most restrictive (involved in the smallest 5025 // dependence distance). 5026 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 5027 5028 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 5029 5030 unsigned MaxVectorSize = WidestRegister / WidestType; 5031 5032 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5033 << " / " << WidestType << " bits.\n"); 5034 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5035 << WidestRegister << " bits.\n"); 5036 5037 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 5038 " into one vector!"); 5039 if (MaxVectorSize == 0) { 5040 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5041 MaxVectorSize = 1; 5042 return MaxVectorSize; 5043 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5044 isPowerOf2_32(ConstTripCount)) { 5045 // We need to clamp the VF to be the ConstTripCount. There is no point in 5046 // choosing a higher viable VF as done in the loop below. 5047 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5048 << ConstTripCount << "\n"); 5049 MaxVectorSize = ConstTripCount; 5050 return MaxVectorSize; 5051 } 5052 5053 unsigned MaxVF = MaxVectorSize; 5054 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5055 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5056 // Collect all viable vectorization factors larger than the default MaxVF 5057 // (i.e. MaxVectorSize). 5058 SmallVector<unsigned, 8> VFs; 5059 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5060 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5061 VFs.push_back(VS); 5062 5063 // For each VF calculate its register usage. 5064 auto RUs = calculateRegisterUsage(VFs); 5065 5066 // Select the largest VF which doesn't require more registers than existing 5067 // ones. 
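  // Worked example for the bandwidth-maximization path (illustrative values,
  // not from the source): with a 256-bit widest register, WidestType == 32
  // gives a default MaxVectorSize of 8; if SmallestType == 8, candidate VFs
  // of 16 and 32 are also collected above, and the loop below picks the
  // largest candidate whose per-class MaxLocalUsers still fits within
  // TTI.getNumberOfRegisters() for that register class.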
5068 for (int i = RUs.size() - 1; i >= 0; --i) { 5069 bool Selected = true; 5070 for (auto& pair : RUs[i].MaxLocalUsers) { 5071 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5072 if (pair.second > TargetNumRegisters) 5073 Selected = false; 5074 } 5075 if (Selected) { 5076 MaxVF = VFs[i]; 5077 break; 5078 } 5079 } 5080 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5081 if (MaxVF < MinVF) { 5082 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5083 << ") with target's minimum: " << MinVF << '\n'); 5084 MaxVF = MinVF; 5085 } 5086 } 5087 } 5088 return MaxVF; 5089 } 5090 5091 VectorizationFactor 5092 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5093 float Cost = expectedCost(1).first; 5094 const float ScalarCost = Cost; 5095 unsigned Width = 1; 5096 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5097 5098 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5099 if (ForceVectorization && MaxVF > 1) { 5100 // Ignore scalar width, because the user explicitly wants vectorization. 5101 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5102 // evaluation. 5103 Cost = std::numeric_limits<float>::max(); 5104 } 5105 5106 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5107 // Notice that the vector loop needs to be executed less times, so 5108 // we need to divide the cost of the vector loops by the width of 5109 // the vector elements. 5110 VectorizationCostTy C = expectedCost(i); 5111 float VectorCost = C.first / (float)i; 5112 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5113 << " costs: " << (int)VectorCost << ".\n"); 5114 if (!C.second && !ForceVectorization) { 5115 LLVM_DEBUG( 5116 dbgs() << "LV: Not considering vector loop of width " << i 5117 << " because it will not generate any vector instructions.\n"); 5118 continue; 5119 } 5120 if (VectorCost < Cost) { 5121 Cost = VectorCost; 5122 Width = i; 5123 } 5124 } 5125 5126 if (!EnableCondStoresVectorization && NumPredStores) { 5127 reportVectorizationFailure("There are conditional stores.", 5128 "store that is conditionally executed prevents vectorization", 5129 "ConditionalStore", ORE, TheLoop); 5130 Width = 1; 5131 Cost = ScalarCost; 5132 } 5133 5134 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5135 << "LV: Vectorization seems to be not beneficial, " 5136 << "but was forced by a user.\n"); 5137 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5138 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; 5139 return Factor; 5140 } 5141 5142 std::pair<unsigned, unsigned> 5143 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5144 unsigned MinWidth = -1U; 5145 unsigned MaxWidth = 8; 5146 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5147 5148 // For each block. 5149 for (BasicBlock *BB : TheLoop->blocks()) { 5150 // For each instruction in the loop. 5151 for (Instruction &I : BB->instructionsWithoutDebug()) { 5152 Type *T = I.getType(); 5153 5154 // Skip ignored values. 5155 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end()) 5156 continue; 5157 5158 // Only examine Loads, Stores and PHINodes. 5159 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5160 continue; 5161 5162 // Examine PHI nodes that are reduction variables. Update the type to 5163 // account for the recurrence type. 
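      // Editorial example (not from the original source): an add-reduction
      // whose PHI is i32 in the IR but whose RecurrenceDescriptor reports
      // getRecurrenceType() == i8 is counted with a width of 8 here, so
      // MinWidth can drop to 8 even though the PHI's IR type is wider.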
5164 if (auto *PN = dyn_cast<PHINode>(&I)) { 5165 if (!Legal->isReductionVariable(PN)) 5166 continue; 5167 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN]; 5168 T = RdxDesc.getRecurrenceType(); 5169 } 5170 5171 // Examine the stored values. 5172 if (auto *ST = dyn_cast<StoreInst>(&I)) 5173 T = ST->getValueOperand()->getType(); 5174 5175 // Ignore loaded pointer types and stored pointer types that are not 5176 // vectorizable. 5177 // 5178 // FIXME: The check here attempts to predict whether a load or store will 5179 // be vectorized. We only know this for certain after a VF has 5180 // been selected. Here, we assume that if an access can be 5181 // vectorized, it will be. We should also look at extending this 5182 // optimization to non-pointer types. 5183 // 5184 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5185 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5186 continue; 5187 5188 MinWidth = std::min(MinWidth, 5189 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5190 MaxWidth = std::max(MaxWidth, 5191 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5192 } 5193 } 5194 5195 return {MinWidth, MaxWidth}; 5196 } 5197 5198 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, 5199 unsigned LoopCost) { 5200 // -- The interleave heuristics -- 5201 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5202 // There are many micro-architectural considerations that we can't predict 5203 // at this level. For example, frontend pressure (on decode or fetch) due to 5204 // code size, or the number and capabilities of the execution ports. 5205 // 5206 // We use the following heuristics to select the interleave count: 5207 // 1. If the code has reductions, then we interleave to break the cross 5208 // iteration dependency. 5209 // 2. If the loop is really small, then we interleave to reduce the loop 5210 // overhead. 5211 // 3. We don't interleave if we think that we will spill registers to memory 5212 // due to the increased register pressure. 5213 5214 if (!isScalarEpilogueAllowed()) 5215 return 1; 5216 5217 // We used the distance for the interleave count. 5218 if (Legal->getMaxSafeDepDistBytes() != -1U) 5219 return 1; 5220 5221 // Do not interleave loops with a relatively small known or estimated trip 5222 // count. 5223 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5224 if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) 5225 return 1; 5226 5227 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5228 // We divide by these constants so assume that we have at least one 5229 // instruction that uses at least one register. 5230 for (auto& pair : R.MaxLocalUsers) { 5231 pair.second = std::max(pair.second, 1U); 5232 } 5233 5234 // We calculate the interleave count using the following formula. 5235 // Subtract the number of loop invariants from the number of available 5236 // registers. These registers are used by all of the interleaved instances. 5237 // Next, divide the remaining registers by the number of registers that is 5238 // required by the loop, in order to estimate how many parallel instances 5239 // fit without causing spills. All of this is rounded down if necessary to be 5240 // a power of two. We want power of two interleave count to simplify any 5241 // addressing operations or alignment considerations. 
5242 // We also want power of two interleave counts to ensure that the induction 5243 // variable of the vector loop wraps to zero, when tail is folded by masking; 5244 // this currently happens when OptForSize, in which case IC is set to 1 above. 5245 unsigned IC = UINT_MAX; 5246 5247 for (auto& pair : R.MaxLocalUsers) { 5248 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5249 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5250 << " registers of " 5251 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5252 if (VF == 1) { 5253 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5254 TargetNumRegisters = ForceTargetNumScalarRegs; 5255 } else { 5256 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5257 TargetNumRegisters = ForceTargetNumVectorRegs; 5258 } 5259 unsigned MaxLocalUsers = pair.second; 5260 unsigned LoopInvariantRegs = 0; 5261 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5262 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5263 5264 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5265 // Don't count the induction variable as interleaved. 5266 if (EnableIndVarRegisterHeur) { 5267 TmpIC = 5268 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5269 std::max(1U, (MaxLocalUsers - 1))); 5270 } 5271 5272 IC = std::min(IC, TmpIC); 5273 } 5274 5275 // Clamp the interleave ranges to reasonable counts. 5276 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5277 5278 // Check if the user has overridden the max. 5279 if (VF == 1) { 5280 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5281 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5282 } else { 5283 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5284 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5285 } 5286 5287 // If trip count is known or estimated compile time constant, limit the 5288 // interleave count to be less than the trip count divided by VF. 5289 if (BestKnownTC) { 5290 MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); 5291 } 5292 5293 // If we did not calculate the cost for VF (because the user selected the VF) 5294 // then we calculate the cost of VF here. 5295 if (LoopCost == 0) 5296 LoopCost = expectedCost(VF).first; 5297 5298 assert(LoopCost && "Non-zero loop cost expected"); 5299 5300 // Clamp the calculated IC to be between the 1 and the max interleave count 5301 // that the target and trip count allows. 5302 if (IC > MaxInterleaveCount) 5303 IC = MaxInterleaveCount; 5304 else if (IC < 1) 5305 IC = 1; 5306 5307 // Interleave if we vectorized this loop and there is a reduction that could 5308 // benefit from interleaving. 5309 if (VF > 1 && !Legal->getReductionVars()->empty()) { 5310 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5311 return IC; 5312 } 5313 5314 // Note that if we've already vectorized the loop we will have done the 5315 // runtime check and so interleaving won't require further checks. 5316 bool InterleavingRequiresRuntimePointerCheck = 5317 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5318 5319 // We want to interleave small loops in order to reduce the loop overhead and 5320 // potentially expose ILP opportunities. 
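  // Worked example for the register-based formula above (illustrative
  // numbers, not from the source): with 16 registers in a class, 2 of them
  // used by loop invariants and MaxLocalUsers == 3, the plain formula gives
  // PowerOf2Floor((16 - 2) / 3) == 4; with EnableIndVarRegisterHeur the
  // induction variable is discounted and the estimate becomes
  // PowerOf2Floor((16 - 2 - 1) / max(1, 3 - 1)) == 4. The result is then
  // clamped by MaxInterleaveCount and, when the trip count is known, by the
  // trip count divided by VF.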
5321 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5322 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5323 // We assume that the cost overhead is 1 and we use the cost model 5324 // to estimate the cost of the loop and interleave until the cost of the 5325 // loop overhead is about 5% of the cost of the loop. 5326 unsigned SmallIC = 5327 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5328 5329 // Interleave until store/load ports (estimated by max interleave count) are 5330 // saturated. 5331 unsigned NumStores = Legal->getNumStores(); 5332 unsigned NumLoads = Legal->getNumLoads(); 5333 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5334 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5335 5336 // If we have a scalar reduction (vector reductions are already dealt with 5337 // by this point), we can increase the critical path length if the loop 5338 // we're interleaving is inside another loop. We limit this, by default, to 2, so the 5339 // critical path only gets increased by one reduction operation. 5340 if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) { 5341 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5342 SmallIC = std::min(SmallIC, F); 5343 StoresIC = std::min(StoresIC, F); 5344 LoadsIC = std::min(LoadsIC, F); 5345 } 5346 5347 if (EnableLoadStoreRuntimeInterleave && 5348 std::max(StoresIC, LoadsIC) > SmallIC) { 5349 LLVM_DEBUG( 5350 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5351 return std::max(StoresIC, LoadsIC); 5352 } 5353 5354 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5355 return SmallIC; 5356 } 5357 5358 // Interleave if this is a large loop (small loops are already dealt with by 5359 // this point) that could benefit from interleaving. 5360 bool HasReductions = !Legal->getReductionVars()->empty(); 5361 if (TTI.enableAggressiveInterleaving(HasReductions)) { 5362 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5363 return IC; 5364 } 5365 5366 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5367 return 1; 5368 } 5369 5370 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5371 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { 5372 // This function calculates the register usage by measuring the highest number 5373 // of values that are alive at a single location. Obviously, this is a very 5374 // rough estimation. We scan the loop in topological order and 5375 // assign a number to each instruction. We use RPO to ensure that defs are 5376 // met before their users. We assume that each instruction that has in-loop 5377 // users starts an interval. We record every time that an in-loop value is 5378 // used, so we have a list of the first and last occurrences of each 5379 // instruction. Next, we transpose this data structure into a multimap that 5380 // holds the list of intervals that *end* at a specific location. This 5381 // multimap allows us to perform a linear search. We scan the instructions linearly 5382 // and record each time that a new interval starts, by placing it in a set. 5383 // If we find this value in the multimap then we remove it from the set. 5384 // The max register usage is the maximum size of the set. 5385 // We also search for instructions that are defined outside the loop, but are 5386 // used inside the loop.
We need this number separately from the max-interval 5387 // usage number because when we unroll, loop-invariant values do not take 5388 // more register. 5389 LoopBlocksDFS DFS(TheLoop); 5390 DFS.perform(LI); 5391 5392 RegisterUsage RU; 5393 5394 // Each 'key' in the map opens a new interval. The values 5395 // of the map are the index of the 'last seen' usage of the 5396 // instruction that is the key. 5397 using IntervalMap = DenseMap<Instruction *, unsigned>; 5398 5399 // Maps instruction to its index. 5400 SmallVector<Instruction *, 64> IdxToInstr; 5401 // Marks the end of each interval. 5402 IntervalMap EndPoint; 5403 // Saves the list of instruction indices that are used in the loop. 5404 SmallPtrSet<Instruction *, 8> Ends; 5405 // Saves the list of values that are used in the loop but are 5406 // defined outside the loop, such as arguments and constants. 5407 SmallPtrSet<Value *, 8> LoopInvariants; 5408 5409 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5410 for (Instruction &I : BB->instructionsWithoutDebug()) { 5411 IdxToInstr.push_back(&I); 5412 5413 // Save the end location of each USE. 5414 for (Value *U : I.operands()) { 5415 auto *Instr = dyn_cast<Instruction>(U); 5416 5417 // Ignore non-instruction values such as arguments, constants, etc. 5418 if (!Instr) 5419 continue; 5420 5421 // If this instruction is outside the loop then record it and continue. 5422 if (!TheLoop->contains(Instr)) { 5423 LoopInvariants.insert(Instr); 5424 continue; 5425 } 5426 5427 // Overwrite previous end points. 5428 EndPoint[Instr] = IdxToInstr.size(); 5429 Ends.insert(Instr); 5430 } 5431 } 5432 } 5433 5434 // Saves the list of intervals that end with the index in 'key'. 5435 using InstrList = SmallVector<Instruction *, 2>; 5436 DenseMap<unsigned, InstrList> TransposeEnds; 5437 5438 // Transpose the EndPoints to a list of values that end at each index. 5439 for (auto &Interval : EndPoint) 5440 TransposeEnds[Interval.second].push_back(Interval.first); 5441 5442 SmallPtrSet<Instruction *, 8> OpenIntervals; 5443 5444 // Get the size of the widest register. 5445 unsigned MaxSafeDepDist = -1U; 5446 if (Legal->getMaxSafeDepDistBytes() != -1U) 5447 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5448 unsigned WidestRegister = 5449 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5450 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5451 5452 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5453 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5454 5455 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5456 5457 // A lambda that gets the register usage for the given type and VF. 5458 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5459 if (Ty->isTokenTy()) 5460 return 0U; 5461 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5462 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5463 }; 5464 5465 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5466 Instruction *I = IdxToInstr[i]; 5467 5468 // Remove all of the instructions that end at this location. 5469 InstrList &List = TransposeEnds[i]; 5470 for (Instruction *ToRemove : List) 5471 OpenIntervals.erase(ToRemove); 5472 5473 // Ignore instructions that are never used within the loop. 5474 if (Ends.find(I) == Ends.end()) 5475 continue; 5476 5477 // Skip ignored values. 5478 if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) 5479 continue; 5480 5481 // For each VF find the maximum usage of registers. 
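      // Editorial worked example (illustrative numbers, not from the source):
      // with VFs[j] == 4 and a 128-bit widest register, an open interval of
      // widened i32 values contributes GetRegUsage(i32, 4) ==
      // max(1, 4 * 32 / 128) == 1 vector register, a widened i64 contributes
      // 4 * 64 / 128 == 2, and a value that is scalar after vectorization
      // counts as one register of its scalar class.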
5482 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5483 // Count the number of live intervals. 5484 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5485 5486 if (VFs[j] == 1) { 5487 for (auto Inst : OpenIntervals) { 5488 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5489 if (RegUsage.find(ClassID) == RegUsage.end()) 5490 RegUsage[ClassID] = 1; 5491 else 5492 RegUsage[ClassID] += 1; 5493 } 5494 } else { 5495 collectUniformsAndScalars(VFs[j]); 5496 for (auto Inst : OpenIntervals) { 5497 // Skip ignored values for VF > 1. 5498 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) 5499 continue; 5500 if (isScalarAfterVectorization(Inst, VFs[j])) { 5501 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5502 if (RegUsage.find(ClassID) == RegUsage.end()) 5503 RegUsage[ClassID] = 1; 5504 else 5505 RegUsage[ClassID] += 1; 5506 } else { 5507 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5508 if (RegUsage.find(ClassID) == RegUsage.end()) 5509 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5510 else 5511 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5512 } 5513 } 5514 } 5515 5516 for (auto& pair : RegUsage) { 5517 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5518 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5519 else 5520 MaxUsages[j][pair.first] = pair.second; 5521 } 5522 } 5523 5524 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5525 << OpenIntervals.size() << '\n'); 5526 5527 // Add the current instruction to the list of open intervals. 5528 OpenIntervals.insert(I); 5529 } 5530 5531 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5532 SmallMapVector<unsigned, unsigned, 4> Invariant; 5533 5534 for (auto Inst : LoopInvariants) { 5535 unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5536 unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); 5537 if (Invariant.find(ClassID) == Invariant.end()) 5538 Invariant[ClassID] = Usage; 5539 else 5540 Invariant[ClassID] += Usage; 5541 } 5542 5543 LLVM_DEBUG({ 5544 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5545 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5546 << " item\n"; 5547 for (const auto &pair : MaxUsages[i]) { 5548 dbgs() << "LV(REG): RegisterClass: " 5549 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5550 << " registers\n"; 5551 } 5552 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5553 << " item\n"; 5554 for (const auto &pair : Invariant) { 5555 dbgs() << "LV(REG): RegisterClass: " 5556 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5557 << " registers\n"; 5558 } 5559 }); 5560 5561 RU.LoopInvariantRegs = Invariant; 5562 RU.MaxLocalUsers = MaxUsages[i]; 5563 RUs[i] = RU; 5564 } 5565 5566 return RUs; 5567 } 5568 5569 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5570 // TODO: Cost model for emulated masked load/store is completely 5571 // broken. This hack guides the cost model to use an artificially 5572 // high enough value to practically disable vectorization with such 5573 // operations, except where previously deployed legality hack allowed 5574 // using very low cost values. This is to avoid regressions coming simply 5575 // from moving "masked load/store" check from legality to cost model. 5576 // Masked Load/Gather emulation was previously never allowed. 5577 // Limited number of Masked Store/Scatter emulation was allowed. 
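  // Illustrative effect (hypothetical threshold): if NumberOfStoresToPredicate
  // were 8, an emulated masked load would always be steered to the artificially
  // high cost, while emulated masked stores would only be penalized once more
  // than 8 predicated stores had been seen in the loop.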
  assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}

void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
        if (!useEmulatedMaskMemRefHack(&I) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }
  }
}

int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
    unsigned VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  int Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
5651 // 5652 // We assume we will only emit a value for lane zero of an instruction 5653 // marked uniform after vectorization, rather than VF identical values. 5654 // Thus, if we scalarize an instruction that uses a uniform, we would 5655 // create uses of values corresponding to the lanes we aren't emitting code 5656 // for. This behavior can be changed by allowing getScalarValue to clone 5657 // the lane zero values for uniforms rather than asserting. 5658 for (Use &U : I->operands()) 5659 if (auto *J = dyn_cast<Instruction>(U.get())) 5660 if (isUniformAfterVectorization(J, VF)) 5661 return false; 5662 5663 // Otherwise, we can scalarize the instruction. 5664 return true; 5665 }; 5666 5667 // Compute the expected cost discount from scalarizing the entire expression 5668 // feeding the predicated instruction. We currently only consider expressions 5669 // that are single-use instruction chains. 5670 Worklist.push_back(PredInst); 5671 while (!Worklist.empty()) { 5672 Instruction *I = Worklist.pop_back_val(); 5673 5674 // If we've already analyzed the instruction, there's nothing to do. 5675 if (ScalarCosts.find(I) != ScalarCosts.end()) 5676 continue; 5677 5678 // Compute the cost of the vector instruction. Note that this cost already 5679 // includes the scalarization overhead of the predicated instruction. 5680 unsigned VectorCost = getInstructionCost(I, VF).first; 5681 5682 // Compute the cost of the scalarized instruction. This cost is the cost of 5683 // the instruction as if it wasn't if-converted and instead remained in the 5684 // predicated block. We will scale this cost by block probability after 5685 // computing the scalarization overhead. 5686 unsigned ScalarCost = VF * getInstructionCost(I, 1).first; 5687 5688 // Compute the scalarization overhead of needed insertelement instructions 5689 // and phi nodes. 5690 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5691 ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), 5692 true, false); 5693 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); 5694 } 5695 5696 // Compute the scalarization overhead of needed extractelement 5697 // instructions. For each of the instruction's operands, if the operand can 5698 // be scalarized, add it to the worklist; otherwise, account for the 5699 // overhead. 5700 for (Use &U : I->operands()) 5701 if (auto *J = dyn_cast<Instruction>(U.get())) { 5702 assert(VectorType::isValidElementType(J->getType()) && 5703 "Instruction has non-scalar type"); 5704 if (canBeScalarized(J)) 5705 Worklist.push_back(J); 5706 else if (needsExtract(J, VF)) 5707 ScalarCost += TTI.getScalarizationOverhead( 5708 ToVectorTy(J->getType(),VF), false, true); 5709 } 5710 5711 // Scale the total scalar cost by block probability. 5712 ScalarCost /= getReciprocalPredBlockProb(); 5713 5714 // Compute the discount. A non-negative discount means the vector version 5715 // of the instruction costs more, and scalarizing would be beneficial. 5716 Discount += VectorCost - ScalarCost; 5717 ScalarCosts[I] = ScalarCost; 5718 } 5719 5720 return Discount; 5721 } 5722 5723 LoopVectorizationCostModel::VectorizationCostTy 5724 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5725 VectorizationCostTy Cost; 5726 5727 // For each block. 5728 for (BasicBlock *BB : TheLoop->blocks()) { 5729 VectorizationCostTy BlockCost; 5730 5731 // For each instruction in the old loop. 5732 for (Instruction &I : BB->instructionsWithoutDebug()) { 5733 // Skip ignored values. 
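      // (ValuesToIgnore holds ephemeral values, and VecValuesToIgnore holds
      // the cast chains recorded for reductions and inductions in
      // collectValuesToIgnore(); skipping them avoids costing instructions
      // that vectorization is expected to remove.)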
5734 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() || 5735 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end())) 5736 continue; 5737 5738 VectorizationCostTy C = getInstructionCost(&I, VF); 5739 5740 // Check if we should override the cost. 5741 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5742 C.first = ForceTargetInstructionCost; 5743 5744 BlockCost.first += C.first; 5745 BlockCost.second |= C.second; 5746 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5747 << " for VF " << VF << " For instruction: " << I 5748 << '\n'); 5749 } 5750 5751 // If we are vectorizing a predicated block, it will have been 5752 // if-converted. This means that the block's instructions (aside from 5753 // stores and instructions that may divide by zero) will now be 5754 // unconditionally executed. For the scalar case, we may not always execute 5755 // the predicated block. Thus, scale the block's cost by the probability of 5756 // executing it. 5757 if (VF == 1 && blockNeedsPredication(BB)) 5758 BlockCost.first /= getReciprocalPredBlockProb(); 5759 5760 Cost.first += BlockCost.first; 5761 Cost.second |= BlockCost.second; 5762 } 5763 5764 return Cost; 5765 } 5766 5767 /// Gets Address Access SCEV after verifying that the access pattern 5768 /// is loop invariant except the induction variable dependence. 5769 /// 5770 /// This SCEV can be sent to the Target in order to estimate the address 5771 /// calculation cost. 5772 static const SCEV *getAddressAccessSCEV( 5773 Value *Ptr, 5774 LoopVectorizationLegality *Legal, 5775 PredicatedScalarEvolution &PSE, 5776 const Loop *TheLoop) { 5777 5778 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5779 if (!Gep) 5780 return nullptr; 5781 5782 // We are looking for a gep with all loop invariant indices except for one 5783 // which should be an induction variable. 5784 auto SE = PSE.getSE(); 5785 unsigned NumOperands = Gep->getNumOperands(); 5786 for (unsigned i = 1; i < NumOperands; ++i) { 5787 Value *Opd = Gep->getOperand(i); 5788 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5789 !Legal->isInductionVariable(Opd)) 5790 return nullptr; 5791 } 5792 5793 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 5794 return PSE.getSCEV(Ptr); 5795 } 5796 5797 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 5798 return Legal->hasStride(I->getOperand(0)) || 5799 Legal->hasStride(I->getOperand(1)); 5800 } 5801 5802 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5803 unsigned VF) { 5804 assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); 5805 Type *ValTy = getMemInstValueType(I); 5806 auto SE = PSE.getSE(); 5807 5808 unsigned AS = getLoadStoreAddressSpace(I); 5809 Value *Ptr = getLoadStorePointerOperand(I); 5810 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5811 5812 // Figure out whether the access is strided and get the stride value 5813 // if it's known in compile time 5814 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5815 5816 // Get the cost of the scalar memory instruction and address computation. 5817 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5818 5819 // Don't pass *I here, since it is scalar but will actually be part of a 5820 // vectorized loop where the user of it is a vectorized instruction. 
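  // Illustrative arithmetic for the formula assembled here (hypothetical TTI
  // numbers): with VF = 4, an address-computation cost of 1 and a scalar
  // memory-op cost of 1, the base cost is 4 * 1 + 4 * 1 = 8, plus the
  // insert/extract overhead from getScalarizationOverhead(); for a predicated
  // access the total is then divided by getReciprocalPredBlockProb().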
5821 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5822 Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 5823 Alignment, AS); 5824 5825 // Get the overhead of the extractelement and insertelement instructions 5826 // we might create due to scalarization. 5827 Cost += getScalarizationOverhead(I, VF); 5828 5829 // If we have a predicated store, it may not be executed for each vector 5830 // lane. Scale the cost by the probability of executing the predicated 5831 // block. 5832 if (isPredicatedInst(I)) { 5833 Cost /= getReciprocalPredBlockProb(); 5834 5835 if (useEmulatedMaskMemRefHack(I)) 5836 // Artificially setting to a high enough value to practically disable 5837 // vectorization with such operations. 5838 Cost = 3000000; 5839 } 5840 5841 return Cost; 5842 } 5843 5844 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5845 unsigned VF) { 5846 Type *ValTy = getMemInstValueType(I); 5847 Type *VectorTy = ToVectorTy(ValTy, VF); 5848 Value *Ptr = getLoadStorePointerOperand(I); 5849 unsigned AS = getLoadStoreAddressSpace(I); 5850 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 5851 5852 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5853 "Stride should be 1 or -1 for consecutive memory access"); 5854 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5855 unsigned Cost = 0; 5856 if (Legal->isMaskRequired(I)) 5857 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, 5858 Alignment ? Alignment->value() : 0, AS); 5859 else 5860 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); 5861 5862 bool Reverse = ConsecutiveStride < 0; 5863 if (Reverse) 5864 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5865 return Cost; 5866 } 5867 5868 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5869 unsigned VF) { 5870 Type *ValTy = getMemInstValueType(I); 5871 Type *VectorTy = ToVectorTy(ValTy, VF); 5872 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5873 unsigned AS = getLoadStoreAddressSpace(I); 5874 if (isa<LoadInst>(I)) { 5875 return TTI.getAddressComputationCost(ValTy) + 5876 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + 5877 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 5878 } 5879 StoreInst *SI = cast<StoreInst>(I); 5880 5881 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 5882 return TTI.getAddressComputationCost(ValTy) + 5883 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + 5884 (isLoopInvariantStoreValue 5885 ? 0 5886 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 5887 VF - 1)); 5888 } 5889 5890 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 5891 unsigned VF) { 5892 Type *ValTy = getMemInstValueType(I); 5893 Type *VectorTy = ToVectorTy(ValTy, VF); 5894 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5895 Value *Ptr = getLoadStorePointerOperand(I); 5896 5897 return TTI.getAddressComputationCost(VectorTy) + 5898 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, 5899 Legal->isMaskRequired(I), 5900 Alignment ? 
Alignment->value() : 0); 5901 } 5902 5903 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 5904 unsigned VF) { 5905 Type *ValTy = getMemInstValueType(I); 5906 Type *VectorTy = ToVectorTy(ValTy, VF); 5907 unsigned AS = getLoadStoreAddressSpace(I); 5908 5909 auto Group = getInterleavedAccessGroup(I); 5910 assert(Group && "Fail to get an interleaved access group."); 5911 5912 unsigned InterleaveFactor = Group->getFactor(); 5913 Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 5914 5915 // Holds the indices of existing members in an interleaved load group. 5916 // An interleaved store group doesn't need this as it doesn't allow gaps. 5917 SmallVector<unsigned, 4> Indices; 5918 if (isa<LoadInst>(I)) { 5919 for (unsigned i = 0; i < InterleaveFactor; i++) 5920 if (Group->getMember(i)) 5921 Indices.push_back(i); 5922 } 5923 5924 // Calculate the cost of the whole interleaved group. 5925 bool UseMaskForGaps = 5926 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5927 unsigned Cost = TTI.getInterleavedMemoryOpCost( 5928 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5929 Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps); 5930 5931 if (Group->isReverse()) { 5932 // TODO: Add support for reversed masked interleaved access. 5933 assert(!Legal->isMaskRequired(I) && 5934 "Reverse masked interleaved access not supported."); 5935 Cost += Group->getNumMembers() * 5936 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5937 } 5938 return Cost; 5939 } 5940 5941 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 5942 unsigned VF) { 5943 // Calculate scalar cost only. Vectorization cost should be ready at this 5944 // moment. 5945 if (VF == 1) { 5946 Type *ValTy = getMemInstValueType(I); 5947 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5948 unsigned AS = getLoadStoreAddressSpace(I); 5949 5950 return TTI.getAddressComputationCost(ValTy) + 5951 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); 5952 } 5953 return getWideningCost(I, VF); 5954 } 5955 5956 LoopVectorizationCostModel::VectorizationCostTy 5957 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 5958 // If we know that this instruction will remain uniform, check the cost of 5959 // the scalar version. 5960 if (isUniformAfterVectorization(I, VF)) 5961 VF = 1; 5962 5963 if (VF > 1 && isProfitableToScalarize(I, VF)) 5964 return VectorizationCostTy(InstsToScalarize[VF][I], false); 5965 5966 // Forced scalars do not have any scalarization overhead. 
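  // (For example, an instruction forced to remain scalar at VF = 4 is simply
  // costed as 4 x its scalar cost below, with no insert/extract overhead
  // added.)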
5967 auto ForcedScalar = ForcedScalars.find(VF); 5968 if (VF > 1 && ForcedScalar != ForcedScalars.end()) { 5969 auto InstSet = ForcedScalar->second; 5970 if (InstSet.find(I) != InstSet.end()) 5971 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); 5972 } 5973 5974 Type *VectorTy; 5975 unsigned C = getInstructionCost(I, VF, VectorTy); 5976 5977 bool TypeNotScalarized = 5978 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; 5979 return VectorizationCostTy(C, TypeNotScalarized); 5980 } 5981 5982 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 5983 unsigned VF) { 5984 5985 if (VF == 1) 5986 return 0; 5987 5988 unsigned Cost = 0; 5989 Type *RetTy = ToVectorTy(I->getType(), VF); 5990 if (!RetTy->isVoidTy() && 5991 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 5992 Cost += TTI.getScalarizationOverhead(RetTy, true, false); 5993 5994 // Some targets keep addresses scalar. 5995 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 5996 return Cost; 5997 5998 // Some targets support efficient element stores. 5999 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6000 return Cost; 6001 6002 // Collect operands to consider. 6003 CallInst *CI = dyn_cast<CallInst>(I); 6004 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6005 6006 // Skip operands that do not require extraction/scalarization and do not incur 6007 // any overhead. 6008 return Cost + TTI.getOperandsScalarizationOverhead( 6009 filterExtractingOperands(Ops, VF), VF); 6010 } 6011 6012 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { 6013 if (VF == 1) 6014 return; 6015 NumPredStores = 0; 6016 for (BasicBlock *BB : TheLoop->blocks()) { 6017 // For each instruction in the old loop. 6018 for (Instruction &I : *BB) { 6019 Value *Ptr = getLoadStorePointerOperand(&I); 6020 if (!Ptr) 6021 continue; 6022 6023 // TODO: We should generate better code and update the cost model for 6024 // predicated uniform stores. Today they are treated as any other 6025 // predicated store (see added test cases in 6026 // invariant-store-vectorization.ll). 6027 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6028 NumPredStores++; 6029 6030 if (Legal->isUniform(Ptr) && 6031 // Conditional loads and stores should be scalarized and predicated. 6032 // isScalarWithPredication cannot be used here since masked 6033 // gather/scatters are not considered scalar with predication. 6034 !Legal->blockNeedsPredication(I.getParent())) { 6035 // TODO: Avoid replicating loads and stores instead of 6036 // relying on instcombine to remove them. 6037 // Load: Scalar load + broadcast 6038 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6039 unsigned Cost = getUniformMemOpCost(&I, VF); 6040 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6041 continue; 6042 } 6043 6044 // We assume that widening is the best solution when possible. 6045 if (memoryInstructionCanBeWidened(&I, VF)) { 6046 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6047 int ConsecutiveStride = 6048 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6049 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6050 "Expected consecutive stride."); 6051 InstWidening Decision = 6052 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6053 setWideningDecision(&I, VF, Decision, Cost); 6054 continue; 6055 } 6056 6057 // Choose between Interleaving, Gather/Scatter or Scalarization. 
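      // Illustrative decision (hypothetical costs): for an interleave group of
      // two loads with InterleaveCost = 6, GatherScatterCost = 8 and
      // ScalarizationCost = 10, the comparisons below pick CM_Interleave; if
      // the group cannot be widened (InterleaveCost stays at UINT_MAX), the
      // gather/scatter cost of 8 wins instead.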
      unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(&I)) {
        auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(&I, VF))
          InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      unsigned GatherScatterCost =
          isLegalGatherOrScatter(&I)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : std::numeric_limits<unsigned>::max();

      unsigned ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose the better solution for the current VF, write down this
      // decision and use it during vectorization.
      unsigned Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
          dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  for (auto *I : AddrDefs)
    Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
6147 setWideningDecision(I, VF, CM_Scalarize, 6148 (VF * getMemoryInstructionCost(I, 1))); 6149 else if (auto Group = getInterleavedAccessGroup(I)) { 6150 // Scalarize an interleave group of address loads. 6151 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6152 if (Instruction *Member = Group->getMember(I)) 6153 setWideningDecision(Member, VF, CM_Scalarize, 6154 (VF * getMemoryInstructionCost(Member, 1))); 6155 } 6156 } 6157 } else 6158 // Make sure I gets scalarized and a cost estimate without 6159 // scalarization overhead. 6160 ForcedScalars[VF].insert(I); 6161 } 6162 } 6163 6164 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6165 unsigned VF, 6166 Type *&VectorTy) { 6167 Type *RetTy = I->getType(); 6168 if (canTruncateToMinimalBitwidth(I, VF)) 6169 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6170 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6171 auto SE = PSE.getSE(); 6172 6173 // TODO: We need to estimate the cost of intrinsic calls. 6174 switch (I->getOpcode()) { 6175 case Instruction::GetElementPtr: 6176 // We mark this instruction as zero-cost because the cost of GEPs in 6177 // vectorized code depends on whether the corresponding memory instruction 6178 // is scalarized or not. Therefore, we handle GEPs with the memory 6179 // instruction cost. 6180 return 0; 6181 case Instruction::Br: { 6182 // In cases of scalarized and predicated instructions, there will be VF 6183 // predicated blocks in the vectorized loop. Each branch around these 6184 // blocks requires also an extract of its vector compare i1 element. 6185 bool ScalarPredicatedBB = false; 6186 BranchInst *BI = cast<BranchInst>(I); 6187 if (VF > 1 && BI->isConditional() && 6188 (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) != 6189 PredicatedBBsAfterVectorization.end() || 6190 PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) != 6191 PredicatedBBsAfterVectorization.end())) 6192 ScalarPredicatedBB = true; 6193 6194 if (ScalarPredicatedBB) { 6195 // Return cost for branches around scalarized and predicated blocks. 6196 Type *Vec_i1Ty = 6197 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6198 return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) + 6199 (TTI.getCFInstrCost(Instruction::Br) * VF)); 6200 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) 6201 // The back-edge branch will remain, as will all scalar branches. 6202 return TTI.getCFInstrCost(Instruction::Br); 6203 else 6204 // This branch will be eliminated by if-conversion. 6205 return 0; 6206 // Note: We currently assume zero cost for an unconditional branch inside 6207 // a predicated block since it will become a fall-through, although we 6208 // may decide in the future to call TTI for all branches. 6209 } 6210 case Instruction::PHI: { 6211 auto *Phi = cast<PHINode>(I); 6212 6213 // First-order recurrences are replaced by vector shuffles inside the loop. 6214 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6215 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) 6216 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 6217 VectorTy, VF - 1, VectorType::get(RetTy, 1)); 6218 6219 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6220 // converted into select instructions. We require N - 1 selects per phi 6221 // node, where N is the number of incoming values. 
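    // For example, a phi merging three incoming values at VF = 4 is costed
    // below as 2 * TTI.getCmpSelInstrCost(Instruction::Select, <4 x ty>,
    // <4 x i1>).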
6222 if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) 6223 return (Phi->getNumIncomingValues() - 1) * 6224 TTI.getCmpSelInstrCost( 6225 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6226 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)); 6227 6228 return TTI.getCFInstrCost(Instruction::PHI); 6229 } 6230 case Instruction::UDiv: 6231 case Instruction::SDiv: 6232 case Instruction::URem: 6233 case Instruction::SRem: 6234 // If we have a predicated instruction, it may not be executed for each 6235 // vector lane. Get the scalarization cost and scale this amount by the 6236 // probability of executing the predicated block. If the instruction is not 6237 // predicated, we fall through to the next case. 6238 if (VF > 1 && isScalarWithPredication(I)) { 6239 unsigned Cost = 0; 6240 6241 // These instructions have a non-void type, so account for the phi nodes 6242 // that we will create. This cost is likely to be zero. The phi node 6243 // cost, if any, should be scaled by the block probability because it 6244 // models a copy at the end of each predicated block. 6245 Cost += VF * TTI.getCFInstrCost(Instruction::PHI); 6246 6247 // The cost of the non-predicated instruction. 6248 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy); 6249 6250 // The cost of insertelement and extractelement instructions needed for 6251 // scalarization. 6252 Cost += getScalarizationOverhead(I, VF); 6253 6254 // Scale the cost by the probability of executing the predicated blocks. 6255 // This assumes the predicated block for each vector lane is equally 6256 // likely. 6257 return Cost / getReciprocalPredBlockProb(); 6258 } 6259 LLVM_FALLTHROUGH; 6260 case Instruction::Add: 6261 case Instruction::FAdd: 6262 case Instruction::Sub: 6263 case Instruction::FSub: 6264 case Instruction::Mul: 6265 case Instruction::FMul: 6266 case Instruction::FDiv: 6267 case Instruction::FRem: 6268 case Instruction::Shl: 6269 case Instruction::LShr: 6270 case Instruction::AShr: 6271 case Instruction::And: 6272 case Instruction::Or: 6273 case Instruction::Xor: { 6274 // Since we will replace the stride by 1 the multiplication should go away. 6275 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6276 return 0; 6277 // Certain instructions can be cheaper to vectorize if they have a constant 6278 // second vector operand. One example of this are shifts on x86. 6279 Value *Op2 = I->getOperand(1); 6280 TargetTransformInfo::OperandValueProperties Op2VP; 6281 TargetTransformInfo::OperandValueKind Op2VK = 6282 TTI.getOperandInfo(Op2, Op2VP); 6283 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6284 Op2VK = TargetTransformInfo::OK_UniformValue; 6285 6286 SmallVector<const Value *, 4> Operands(I->operand_values()); 6287 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6288 return N * TTI.getArithmeticInstrCost( 6289 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6290 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6291 } 6292 case Instruction::FNeg: { 6293 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6294 return N * TTI.getArithmeticInstrCost( 6295 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6296 TargetTransformInfo::OK_AnyValue, 6297 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6298 I->getOperand(0), I); 6299 } 6300 case Instruction::Select: { 6301 SelectInst *SI = cast<SelectInst>(I); 6302 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6303 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6304 Type *CondTy = SI->getCondition()->getType(); 6305 if (!ScalarCond) 6306 CondTy = VectorType::get(CondTy, VF); 6307 6308 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I); 6309 } 6310 case Instruction::ICmp: 6311 case Instruction::FCmp: { 6312 Type *ValTy = I->getOperand(0)->getType(); 6313 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6314 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6315 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6316 VectorTy = ToVectorTy(ValTy, VF); 6317 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I); 6318 } 6319 case Instruction::Store: 6320 case Instruction::Load: { 6321 unsigned Width = VF; 6322 if (Width > 1) { 6323 InstWidening Decision = getWideningDecision(I, Width); 6324 assert(Decision != CM_Unknown && 6325 "CM decision should be taken at this point"); 6326 if (Decision == CM_Scalarize) 6327 Width = 1; 6328 } 6329 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6330 return getMemoryInstructionCost(I, VF); 6331 } 6332 case Instruction::ZExt: 6333 case Instruction::SExt: 6334 case Instruction::FPToUI: 6335 case Instruction::FPToSI: 6336 case Instruction::FPExt: 6337 case Instruction::PtrToInt: 6338 case Instruction::IntToPtr: 6339 case Instruction::SIToFP: 6340 case Instruction::UIToFP: 6341 case Instruction::Trunc: 6342 case Instruction::FPTrunc: 6343 case Instruction::BitCast: { 6344 // We optimize the truncation of induction variables having constant 6345 // integer steps. The cost of these truncations is the same as the scalar 6346 // operation. 6347 if (isOptimizableIVTruncate(I, VF)) { 6348 auto *Trunc = cast<TruncInst>(I); 6349 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6350 Trunc->getSrcTy(), Trunc); 6351 } 6352 6353 Type *SrcScalarTy = I->getOperand(0)->getType(); 6354 Type *SrcVecTy = 6355 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6356 if (canTruncateToMinimalBitwidth(I, VF)) { 6357 // This cast is going to be shrunk. This may remove the cast or it might 6358 // turn it into slightly different cast. For example, if MinBW == 16, 6359 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6360 // 6361 // Calculate the modified src and dest types. 6362 Type *MinVecTy = VectorTy; 6363 if (I->getOpcode() == Instruction::Trunc) { 6364 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6365 VectorTy = 6366 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6367 } else if (I->getOpcode() == Instruction::ZExt || 6368 I->getOpcode() == Instruction::SExt) { 6369 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6370 VectorTy = 6371 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6372 } 6373 } 6374 6375 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6376 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); 6377 } 6378 case Instruction::Call: { 6379 bool NeedToScalarize; 6380 CallInst *CI = cast<CallInst>(I); 6381 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6382 if (getVectorIntrinsicIDForCall(CI, TLI)) 6383 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6384 return CallCost; 6385 } 6386 default: 6387 // The cost of executing VF copies of the scalar instruction. This opcode 6388 // is unknown. Assume that it is the same as 'mul'. 6389 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + 6390 getScalarizationOverhead(I, VF); 6391 } // end of switch. 6392 } 6393 6394 char LoopVectorize::ID = 0; 6395 6396 static const char lv_name[] = "Loop Vectorization"; 6397 6398 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6399 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6400 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6401 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6402 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6403 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6404 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6405 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6406 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6407 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6408 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6409 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6410 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6411 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6412 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6413 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6414 6415 namespace llvm { 6416 6417 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6418 6419 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6420 bool VectorizeOnlyWhenForced) { 6421 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6422 } 6423 6424 } // end namespace llvm 6425 6426 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6427 // Check if the pointer operand of a load or store instruction is 6428 // consecutive. 6429 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6430 return Legal->isConsecutivePtr(Ptr); 6431 return false; 6432 } 6433 6434 void LoopVectorizationCostModel::collectValuesToIgnore() { 6435 // Ignore ephemeral values. 6436 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6437 6438 // Ignore type-promoting instructions we identified during reduction 6439 // detection. 6440 for (auto &Reduction : *Legal->getReductionVars()) { 6441 RecurrenceDescriptor &RedDes = Reduction.second; 6442 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6443 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6444 } 6445 // Ignore type-casting instructions we identified during induction 6446 // detection. 6447 for (auto &Induction : *Legal->getInductionVars()) { 6448 InductionDescriptor &IndDes = Induction.second; 6449 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6450 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6451 } 6452 } 6453 6454 // TODO: we could return a pair of values that specify the max VF and 6455 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6456 // `buildVPlans(VF, VF)`. 
// We cannot do it because VPlan at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                                 LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
  return WidestVectorRegBits / WidestType;
}

VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
  unsigned VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->empty()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (!UserVF) {
      VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && VF < 2) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = 4;
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
                      << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
  assert(OrigLoop->empty() && "Inner loop expected.");
  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
    return None;

  // Invalidate interleave groups if all blocks of the loop will be predicated.
  if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(*TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    CM.InterleaveInfo.reset();
  }

  if (UserVF) {
    LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    CM.selectUserVectorizationFactor(UserVF);
    buildVPlansWithVPRecipes(UserVF, UserVF);
    LLVM_DEBUG(printPlans(dbgs()));
    return {{UserVF, 0}};
  }

  unsigned MaxVF = MaybeMaxVF.getValue();
  assert(MaxVF != 0 && "MaxVF is zero.");

  for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(VF);

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
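    // (For example, with MaxVF == 8 this loop analyzes VF = 1, 2, 4 and 8;
    // buildVPlansWithVPRecipes(1, MaxVF) below builds plans covering the same
    // range, and selectVectorizationFactor(MaxVF) picks among them.)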
6542 if (VF > 1) 6543 CM.collectInstsToScalarize(VF); 6544 } 6545 6546 buildVPlansWithVPRecipes(1, MaxVF); 6547 LLVM_DEBUG(printPlans(dbgs())); 6548 if (MaxVF == 1) 6549 return VectorizationFactor::Disabled(); 6550 6551 // Select the optimal vectorization factor. 6552 return CM.selectVectorizationFactor(MaxVF); 6553 } 6554 6555 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { 6556 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 6557 << '\n'); 6558 BestVF = VF; 6559 BestUF = UF; 6560 6561 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 6562 return !Plan->hasVF(VF); 6563 }); 6564 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 6565 } 6566 6567 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 6568 DominatorTree *DT) { 6569 // Perform the actual loop transformation. 6570 6571 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 6572 VPCallbackILV CallbackILV(ILV); 6573 6574 VPTransformState State{BestVF, BestUF, LI, 6575 DT, ILV.Builder, ILV.VectorLoopValueMap, 6576 &ILV, CallbackILV}; 6577 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 6578 State.TripCount = ILV.getOrCreateTripCount(nullptr); 6579 6580 //===------------------------------------------------===// 6581 // 6582 // Notice: any optimization or new instruction that go 6583 // into the code below should also be implemented in 6584 // the cost-model. 6585 // 6586 //===------------------------------------------------===// 6587 6588 // 2. Copy and widen instructions from the old loop into the new loop. 6589 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 6590 VPlans.front()->execute(&State); 6591 6592 // 3. Fix the vectorized code: take care of header phi's, live-outs, 6593 // predication, updating analyses. 6594 ILV.fixVectorizedLoop(); 6595 } 6596 6597 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 6598 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6599 BasicBlock *Latch = OrigLoop->getLoopLatch(); 6600 6601 // We create new control-flow for the vectorized loop, so the original 6602 // condition will be dead after vectorization if it's only used by the 6603 // branch. 6604 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 6605 if (Cmp && Cmp->hasOneUse()) 6606 DeadInstructions.insert(Cmp); 6607 6608 // We create new "steps" for induction variable updates to which the original 6609 // induction variables map. An original update instruction will be dead if 6610 // all its users except the induction variable are dead. 6611 for (auto &Induction : *Legal->getInductionVars()) { 6612 PHINode *Ind = Induction.first; 6613 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 6614 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 6615 return U == Ind || DeadInstructions.find(cast<Instruction>(U)) != 6616 DeadInstructions.end(); 6617 })) 6618 DeadInstructions.insert(IndUpdate); 6619 6620 // We record as "Dead" also the type-casting instructions we had identified 6621 // during induction analysis. We don't need any handling for them in the 6622 // vectorized loop because we have proven that, under a proper runtime 6623 // test guarding the vectorized loop, the value of the phi, and the casted 6624 // value of the phi, are the same. The last instruction in this casting chain 6625 // will get its scalar/vector/widened def from the scalar/vector/widened def 6626 // of the respective phi node. 
Any other casts in the induction def-use chain 6627 // have no other uses outside the phi update chain, and will be ignored. 6628 InductionDescriptor &IndDes = Induction.second; 6629 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6630 DeadInstructions.insert(Casts.begin(), Casts.end()); 6631 } 6632 } 6633 6634 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6635 6636 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6637 6638 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6639 Instruction::BinaryOps BinOp) { 6640 // When unrolling and the VF is 1, we only need to add a simple scalar. 6641 Type *Ty = Val->getType(); 6642 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6643 6644 if (Ty->isFloatingPointTy()) { 6645 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6646 6647 // Floating point operations had to be 'fast' to enable the unrolling. 6648 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6649 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6650 } 6651 Constant *C = ConstantInt::get(Ty, StartIdx); 6652 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6653 } 6654 6655 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6656 SmallVector<Metadata *, 4> MDs; 6657 // Reserve first location for self reference to the LoopID metadata node. 6658 MDs.push_back(nullptr); 6659 bool IsUnrollMetadata = false; 6660 MDNode *LoopID = L->getLoopID(); 6661 if (LoopID) { 6662 // First find existing loop unrolling disable metadata. 6663 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6664 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6665 if (MD) { 6666 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6667 IsUnrollMetadata = 6668 S && S->getString().startswith("llvm.loop.unroll.disable"); 6669 } 6670 MDs.push_back(LoopID->getOperand(i)); 6671 } 6672 } 6673 6674 if (!IsUnrollMetadata) { 6675 // Add runtime unroll disable metadata. 6676 LLVMContext &Context = L->getHeader()->getContext(); 6677 SmallVector<Metadata *, 1> DisableOperands; 6678 DisableOperands.push_back( 6679 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6680 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6681 MDs.push_back(DisableNode); 6682 MDNode *NewLoopID = MDNode::get(Context, MDs); 6683 // Set operand 0 to refer to the loop id itself. 6684 NewLoopID->replaceOperandWith(0, NewLoopID); 6685 L->setLoopID(NewLoopID); 6686 } 6687 } 6688 6689 bool LoopVectorizationPlanner::getDecisionAndClampRange( 6690 const std::function<bool(unsigned)> &Predicate, VFRange &Range) { 6691 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 6692 bool PredicateAtRangeStart = Predicate(Range.Start); 6693 6694 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 6695 if (Predicate(TmpVF) != PredicateAtRangeStart) { 6696 Range.End = TmpVF; 6697 break; 6698 } 6699 6700 return PredicateAtRangeStart; 6701 } 6702 6703 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 6704 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 6705 /// of VF's starting at a given VF and extending it as much as possible. Each 6706 /// vectorization decision can potentially shorten this sub-range during 6707 /// buildVPlan(). 
6708 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 6709 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6710 VFRange SubRange = {VF, MaxVF + 1}; 6711 VPlans.push_back(buildVPlan(SubRange)); 6712 VF = SubRange.End; 6713 } 6714 } 6715 6716 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 6717 VPlanPtr &Plan) { 6718 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 6719 6720 // Look for cached value. 6721 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 6722 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 6723 if (ECEntryIt != EdgeMaskCache.end()) 6724 return ECEntryIt->second; 6725 6726 VPValue *SrcMask = createBlockInMask(Src, Plan); 6727 6728 // The terminator has to be a branch inst! 6729 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 6730 assert(BI && "Unexpected terminator found"); 6731 6732 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 6733 return EdgeMaskCache[Edge] = SrcMask; 6734 6735 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 6736 assert(EdgeMask && "No Edge Mask found for condition"); 6737 6738 if (BI->getSuccessor(0) != Dst) 6739 EdgeMask = Builder.createNot(EdgeMask); 6740 6741 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 6742 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 6743 6744 return EdgeMaskCache[Edge] = EdgeMask; 6745 } 6746 6747 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 6748 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 6749 6750 // Look for cached value. 6751 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 6752 if (BCEntryIt != BlockMaskCache.end()) 6753 return BCEntryIt->second; 6754 6755 // All-one mask is modelled as no-mask following the convention for masked 6756 // load/store/gather/scatter. Initialize BlockMask to no-mask. 6757 VPValue *BlockMask = nullptr; 6758 6759 if (OrigLoop->getHeader() == BB) { 6760 if (!CM.blockNeedsPredication(BB)) 6761 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 6762 6763 // Introduce the early-exit compare IV <= BTC to form header block mask. 6764 // This is used instead of IV < TC because TC may wrap, unlike BTC. 6765 VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction()); 6766 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 6767 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 6768 return BlockMaskCache[BB] = BlockMask; 6769 } 6770 6771 // This is the block mask. We OR all incoming edges. 6772 for (auto *Predecessor : predecessors(BB)) { 6773 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 6774 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 6775 return BlockMaskCache[BB] = EdgeMask; 6776 6777 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
6778 BlockMask = EdgeMask; 6779 continue; 6780 } 6781 6782 BlockMask = Builder.createOr(BlockMask, EdgeMask); 6783 } 6784 6785 return BlockMaskCache[BB] = BlockMask; 6786 } 6787 6788 VPWidenMemoryInstructionRecipe * 6789 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 6790 VPlanPtr &Plan) { 6791 if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) 6792 return nullptr; 6793 6794 auto willWiden = [&](unsigned VF) -> bool { 6795 if (VF == 1) 6796 return false; 6797 LoopVectorizationCostModel::InstWidening Decision = 6798 CM.getWideningDecision(I, VF); 6799 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 6800 "CM decision should be taken at this point."); 6801 if (Decision == LoopVectorizationCostModel::CM_Interleave) 6802 return true; 6803 if (CM.isScalarAfterVectorization(I, VF) || 6804 CM.isProfitableToScalarize(I, VF)) 6805 return false; 6806 return Decision != LoopVectorizationCostModel::CM_Scalarize; 6807 }; 6808 6809 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6810 return nullptr; 6811 6812 VPValue *Mask = nullptr; 6813 if (Legal->isMaskRequired(I)) 6814 Mask = createBlockInMask(I->getParent(), Plan); 6815 6816 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 6817 return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask); 6818 } 6819 6820 VPWidenIntOrFpInductionRecipe * 6821 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) { 6822 if (PHINode *Phi = dyn_cast<PHINode>(I)) { 6823 // Check if this is an integer or fp induction. If so, build the recipe that 6824 // produces its scalar and vector values. 6825 InductionDescriptor II = Legal->getInductionVars()->lookup(Phi); 6826 if (II.getKind() == InductionDescriptor::IK_IntInduction || 6827 II.getKind() == InductionDescriptor::IK_FpInduction) 6828 return new VPWidenIntOrFpInductionRecipe(Phi); 6829 6830 return nullptr; 6831 } 6832 6833 // Optimize the special case where the source is a constant integer 6834 // induction variable. Notice that we can only optimize the 'trunc' case 6835 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 6836 // (c) other casts depend on pointer size. 6837 6838 // Determine whether \p K is a truncation based on an induction variable that 6839 // can be optimized. 6840 auto isOptimizableIVTruncate = 6841 [&](Instruction *K) -> std::function<bool(unsigned)> { 6842 return 6843 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; 6844 }; 6845 6846 if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange( 6847 isOptimizableIVTruncate(I), Range)) 6848 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 6849 cast<TruncInst>(I)); 6850 return nullptr; 6851 } 6852 6853 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) { 6854 PHINode *Phi = dyn_cast<PHINode>(I); 6855 if (!Phi || Phi->getParent() == OrigLoop->getHeader()) 6856 return nullptr; 6857 6858 // We know that all PHIs in non-header blocks are converted into selects, so 6859 // we don't have to worry about the insertion order and we can just use the 6860 // builder. At this point we generate the predication tree. There may be 6861 // duplications since this is a simple recursive scan, but future 6862 // optimizations will clean it up. 
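  // Illustrative sketch: a phi merging v0 from block B0 and v1 from B1 gets
  // the two edge masks computed below; the resulting VPBlendRecipe is later
  // lowered to a chain of selects keyed on those masks, which is why the
  // insertion order of the incoming values does not matter here.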
6863 6864 SmallVector<VPValue *, 2> Masks; 6865 unsigned NumIncoming = Phi->getNumIncomingValues(); 6866 for (unsigned In = 0; In < NumIncoming; In++) { 6867 VPValue *EdgeMask = 6868 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 6869 assert((EdgeMask || NumIncoming == 1) && 6870 "Multiple predecessors with one having a full mask"); 6871 if (EdgeMask) 6872 Masks.push_back(EdgeMask); 6873 } 6874 return new VPBlendRecipe(Phi, Masks); 6875 } 6876 6877 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, 6878 VFRange &Range) { 6879 6880 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6881 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6882 6883 if (IsPredicated) 6884 return false; 6885 6886 auto IsVectorizableOpcode = [](unsigned Opcode) { 6887 switch (Opcode) { 6888 case Instruction::Add: 6889 case Instruction::And: 6890 case Instruction::AShr: 6891 case Instruction::BitCast: 6892 case Instruction::Br: 6893 case Instruction::Call: 6894 case Instruction::FAdd: 6895 case Instruction::FCmp: 6896 case Instruction::FDiv: 6897 case Instruction::FMul: 6898 case Instruction::FNeg: 6899 case Instruction::FPExt: 6900 case Instruction::FPToSI: 6901 case Instruction::FPToUI: 6902 case Instruction::FPTrunc: 6903 case Instruction::FRem: 6904 case Instruction::FSub: 6905 case Instruction::ICmp: 6906 case Instruction::IntToPtr: 6907 case Instruction::Load: 6908 case Instruction::LShr: 6909 case Instruction::Mul: 6910 case Instruction::Or: 6911 case Instruction::PHI: 6912 case Instruction::PtrToInt: 6913 case Instruction::SDiv: 6914 case Instruction::Select: 6915 case Instruction::SExt: 6916 case Instruction::Shl: 6917 case Instruction::SIToFP: 6918 case Instruction::SRem: 6919 case Instruction::Store: 6920 case Instruction::Sub: 6921 case Instruction::Trunc: 6922 case Instruction::UDiv: 6923 case Instruction::UIToFP: 6924 case Instruction::URem: 6925 case Instruction::Xor: 6926 case Instruction::ZExt: 6927 return true; 6928 } 6929 return false; 6930 }; 6931 6932 if (!IsVectorizableOpcode(I->getOpcode())) 6933 return false; 6934 6935 if (CallInst *CI = dyn_cast<CallInst>(I)) { 6936 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6937 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 6938 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 6939 return false; 6940 } 6941 6942 auto willWiden = [&](unsigned VF) -> bool { 6943 if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) || 6944 CM.isProfitableToScalarize(I, VF))) 6945 return false; 6946 if (CallInst *CI = dyn_cast<CallInst>(I)) { 6947 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6948 // The following case may be scalarized depending on the VF. 6949 // The flag shows whether we use Intrinsic or a usual Call for vectorized 6950 // version of the instruction. 6951 // Is it beneficial to perform intrinsic call compared to lib call? 
6952 bool NeedToScalarize; 6953 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 6954 bool UseVectorIntrinsic = 6955 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 6956 return UseVectorIntrinsic || !NeedToScalarize; 6957 } 6958 if (isa<LoadInst>(I) || isa<StoreInst>(I)) { 6959 assert(CM.getWideningDecision(I, VF) == 6960 LoopVectorizationCostModel::CM_Scalarize && 6961 "Memory widening decisions should have been taken care by now"); 6962 return false; 6963 } 6964 return true; 6965 }; 6966 6967 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6968 return false; 6969 // If this ingredient's recipe is to be recorded, keep its recipe a singleton 6970 // to avoid having to split recipes later. 6971 bool IsSingleton = Ingredient2Recipe.count(I); 6972 6973 // Success: widen this instruction. 6974 6975 // Use the default widening recipe. We optimize the common case where 6976 // consecutive instructions can be represented by a single recipe. 6977 if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() && 6978 LastExtensibleRecipe->appendInstruction(I)) 6979 return true; 6980 6981 VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I); 6982 if (!IsSingleton) 6983 LastExtensibleRecipe = WidenRecipe; 6984 setRecipe(I, WidenRecipe); 6985 VPBB->appendRecipe(WidenRecipe); 6986 return true; 6987 } 6988 6989 VPBasicBlock *VPRecipeBuilder::handleReplication( 6990 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 6991 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 6992 VPlanPtr &Plan) { 6993 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 6994 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, 6995 Range); 6996 6997 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6998 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6999 7000 auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); 7001 setRecipe(I, Recipe); 7002 7003 // Find if I uses a predicated instruction. If so, it will use its scalar 7004 // value. Avoid hoisting the insert-element which packs the scalar value into 7005 // a vector value, as that happens iff all users use the vector value. 7006 for (auto &Op : I->operands()) 7007 if (auto *PredInst = dyn_cast<Instruction>(Op)) 7008 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 7009 PredInst2Recipe[PredInst]->setAlsoPack(false); 7010 7011 // Finalize the recipe for Instr, first if it is not predicated. 7012 if (!IsPredicated) { 7013 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 7014 VPBB->appendRecipe(Recipe); 7015 return VPBB; 7016 } 7017 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7018 assert(VPBB->getSuccessors().empty() && 7019 "VPBB has successors when handling predicated replication."); 7020 // Record predicated instructions for above packing optimizations. 7021 PredInst2Recipe[I] = Recipe; 7022 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7023 VPBlockUtils::insertBlockAfter(Region, VPBB); 7024 auto *RegSucc = new VPBasicBlock(); 7025 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7026 return RegSucc; 7027 } 7028 7029 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7030 VPRecipeBase *PredRecipe, 7031 VPlanPtr &Plan) { 7032 // Instructions marked for predication are replicated and placed under an 7033 // if-then construct to prevent side-effects. 7034 7035 // Generate recipes to compute the block mask for this region. 
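  // Note that createBlockInMask() caches the mask per basic block (see
  // BlockMaskCache above), so asking for the same block again reuses the
  // already created VPValue instead of emitting duplicate mask recipes.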
7036 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 7037 7038 // Build the triangular if-then region. 7039 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 7040 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 7041 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 7042 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 7043 auto *PHIRecipe = 7044 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 7045 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 7046 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 7047 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 7048 7049 // Note: first set Entry as region entry and then connect successors starting 7050 // from it in order, to propagate the "parent" of each VPBasicBlock. 7051 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 7052 VPBlockUtils::connectBlocks(Pred, Exit); 7053 7054 return Region; 7055 } 7056 7057 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, 7058 VPlanPtr &Plan, VPBasicBlock *VPBB) { 7059 VPRecipeBase *Recipe = nullptr; 7060 7061 // First, check for specific widening recipes that deal with memory 7062 // operations, inductions and Phi nodes. 7063 if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) || 7064 (Recipe = tryToOptimizeInduction(Instr, Range)) || 7065 (Recipe = tryToBlend(Instr, Plan)) || 7066 (isa<PHINode>(Instr) && 7067 (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) { 7068 setRecipe(Instr, Recipe); 7069 VPBB->appendRecipe(Recipe); 7070 return true; 7071 } 7072 7073 // Handle GEP widening. 7074 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) { 7075 auto Scalarize = [&](unsigned VF) { 7076 return CM.isScalarWithPredication(Instr, VF) || 7077 CM.isScalarAfterVectorization(Instr, VF) || 7078 CM.isProfitableToScalarize(Instr, VF); 7079 }; 7080 if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range)) 7081 return false; 7082 VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop); 7083 setRecipe(Instr, Recipe); 7084 VPBB->appendRecipe(Recipe); 7085 return true; 7086 } 7087 7088 // Check if Instr is to be widened by a general VPWidenRecipe, after 7089 // having first checked for specific widening recipes. 7090 if (tryToWiden(Instr, VPBB, Range)) 7091 return true; 7092 7093 return false; 7094 } 7095 7096 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 7097 unsigned MaxVF) { 7098 assert(OrigLoop->empty() && "Inner loop expected."); 7099 7100 // Collect conditions feeding internal conditional branches; they need to be 7101 // represented in VPlan for it to model masking. 7102 SmallPtrSet<Value *, 1> NeedDef; 7103 7104 auto *Latch = OrigLoop->getLoopLatch(); 7105 for (BasicBlock *BB : OrigLoop->blocks()) { 7106 if (BB == Latch) 7107 continue; 7108 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); 7109 if (Branch && Branch->isConditional()) 7110 NeedDef.insert(Branch->getCondition()); 7111 } 7112 7113 // If the tail is to be folded by masking, the primary induction variable 7114 // needs to be represented in VPlan for it to model early-exit masking. 7115 // Also, both the Phi and the live-out instruction of each reduction are 7116 // required in order to introduce a select between them in VPlan. 
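  // Illustrative sketch (names are made up): for a sum reduction whose
  // loop-exit value is
  //   %red.next = add i32 %red.phi, %x
  // folding the tail introduces a select at the latch of the form
  //   select(<header block-in mask>, %red.next, %red.phi)
  // so that masked-off iterations keep the incoming value; see the fold-tail
  // handling near the end of buildVPlanWithVPRecipes().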
7117 if (CM.foldTailByMasking()) { 7118 NeedDef.insert(Legal->getPrimaryInduction()); 7119 for (auto &Reduction : *Legal->getReductionVars()) { 7120 NeedDef.insert(Reduction.first); 7121 NeedDef.insert(Reduction.second.getLoopExitInstr()); 7122 } 7123 } 7124 7125 // Collect instructions from the original loop that will become trivially dead 7126 // in the vectorized loop. We don't need to vectorize these instructions. For 7127 // example, original induction update instructions can become dead because we 7128 // separately emit induction "steps" when generating code for the new loop. 7129 // Similarly, we create a new latch condition when setting up the structure 7130 // of the new loop, so the old one can become dead. 7131 SmallPtrSet<Instruction *, 4> DeadInstructions; 7132 collectTriviallyDeadInstructions(DeadInstructions); 7133 7134 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 7135 VFRange SubRange = {VF, MaxVF + 1}; 7136 VPlans.push_back( 7137 buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions)); 7138 VF = SubRange.End; 7139 } 7140 } 7141 7142 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 7143 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, 7144 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7145 7146 // Hold a mapping from predicated instructions to their recipes, in order to 7147 // fix their AlsoPack behavior if a user is determined to replicate and use a 7148 // scalar instead of vector value. 7149 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; 7150 7151 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 7152 7153 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 7154 7155 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); 7156 7157 // --------------------------------------------------------------------------- 7158 // Pre-construction: record ingredients whose recipes we'll need to further 7159 // process after constructing the initial VPlan. 7160 // --------------------------------------------------------------------------- 7161 7162 // Mark instructions we'll need to sink later and their targets as 7163 // ingredients whose recipe we'll need to record. 7164 for (auto &Entry : SinkAfter) { 7165 RecipeBuilder.recordRecipeOf(Entry.first); 7166 RecipeBuilder.recordRecipeOf(Entry.second); 7167 } 7168 7169 // For each interleave group which is relevant for this (possibly trimmed) 7170 // Range, add it to the set of groups to be later applied to the VPlan and add 7171 // placeholders for its members' Recipes which we'll be replacing with a 7172 // single VPInterleaveRecipe. 7173 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 7174 auto applyIG = [IG, this](unsigned VF) -> bool { 7175 return (VF >= 2 && // Query is illegal for VF == 1 7176 CM.getWideningDecision(IG->getInsertPos(), VF) == 7177 LoopVectorizationCostModel::CM_Interleave); 7178 }; 7179 if (!getDecisionAndClampRange(applyIG, Range)) 7180 continue; 7181 InterleaveGroups.insert(IG); 7182 for (unsigned i = 0; i < IG->getFactor(); i++) 7183 if (Instruction *Member = IG->getMember(i)) 7184 RecipeBuilder.recordRecipeOf(Member); 7185 }; 7186 7187 // --------------------------------------------------------------------------- 7188 // Build initial VPlan: Scan the body of the loop in a topological order to 7189 // visit each basic block after having visited its predecessor basic blocks. 
  // ---------------------------------------------------------------------------

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  auto Plan = std::make_unique<VPlan>(VPBB);

  // Represent values that will have defs inside VPlan.
  for (Value *V : NeedDef)
    Plan->addVPValue(V);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    // Introduce each ingredient into VPlan.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
      if (isa<BranchInst>(Instr) ||
          DeadInstructions.find(Instr) != DeadInstructions.end())
        continue;

      if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
        continue;

      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
      VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
          Instr, Range, VPBB, PredInst2Recipe, Plan);
      if (NextVPBB != VPBB) {
        VPBB = NextVPBB;
        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
                                    : "");
      }
    }
  }

  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one, VPBB, reflecting original
  // basic blocks with no recipes.
  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
  VPBlockUtils::disconnectBlocks(PreEntry, Entry);
  delete PreEntry;

  // ---------------------------------------------------------------------------
  // Transform initial VPlan: Apply previously taken decisions, in order, to
  // bring the VPlan to its final state.
  // ---------------------------------------------------------------------------

  // Apply Sink-After legal constraints.
  for (auto &Entry : SinkAfter) {
    VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
    VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
    Sink->moveAfter(Target);
  }

  // Interleave memory: for each Interleave Group we marked earlier as relevant
  // for this VPlan, replace the Recipes widening its memory instructions with a
  // single VPInterleaveRecipe at its insertion point.
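  // Rough illustration: for a factor-2 group accessing A[2*i] and A[2*i+1],
  // the two member recipes created above are erased and a single
  // VPInterleaveRecipe at the group's insert position takes over, which later
  // emits one wide memory access plus shuffles to (de)interleave the members.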
7270 for (auto IG : InterleaveGroups) { 7271 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7272 RecipeBuilder.getRecipe(IG->getInsertPos())); 7273 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) 7274 ->insertBefore(Recipe); 7275 7276 for (unsigned i = 0; i < IG->getFactor(); ++i) 7277 if (Instruction *Member = IG->getMember(i)) { 7278 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7279 } 7280 } 7281 7282 // Finally, if tail is folded by masking, introduce selects between the phi 7283 // and the live-out instruction of each reduction, at the end of the latch. 7284 if (CM.foldTailByMasking()) { 7285 Builder.setInsertPoint(VPBB); 7286 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 7287 for (auto &Reduction : *Legal->getReductionVars()) { 7288 VPValue *Phi = Plan->getVPValue(Reduction.first); 7289 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); 7290 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 7291 } 7292 } 7293 7294 std::string PlanName; 7295 raw_string_ostream RSO(PlanName); 7296 unsigned VF = Range.Start; 7297 Plan->addVF(VF); 7298 RSO << "Initial VPlan for VF={" << VF; 7299 for (VF *= 2; VF < Range.End; VF *= 2) { 7300 Plan->addVF(VF); 7301 RSO << "," << VF; 7302 } 7303 RSO << "},UF>=1"; 7304 RSO.flush(); 7305 Plan->setName(PlanName); 7306 7307 return Plan; 7308 } 7309 7310 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 7311 // Outer loop handling: They may require CFG and instruction level 7312 // transformations before even evaluating whether vectorization is profitable. 7313 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7314 // the vectorization pipeline. 7315 assert(!OrigLoop->empty()); 7316 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7317 7318 // Create new empty VPlan 7319 auto Plan = std::make_unique<VPlan>(); 7320 7321 // Build hierarchical CFG 7322 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 7323 HCFGBuilder.buildHierarchicalCFG(); 7324 7325 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) 7326 Plan->addVF(VF); 7327 7328 if (EnableVPlanPredication) { 7329 VPlanPredicator VPP(*Plan); 7330 VPP.predicate(); 7331 7332 // Avoid running transformation to recipes until masked code generation in 7333 // VPlan-native path is in place. 
7334 return Plan; 7335 } 7336 7337 SmallPtrSet<Instruction *, 1> DeadInstructions; 7338 VPlanTransforms::VPInstructionsToVPRecipes( 7339 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 7340 return Plan; 7341 } 7342 7343 Value* LoopVectorizationPlanner::VPCallbackILV:: 7344 getOrCreateVectorValues(Value *V, unsigned Part) { 7345 return ILV.getOrCreateVectorValue(V, Part); 7346 } 7347 7348 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 7349 Value *V, const VPIteration &Instance) { 7350 return ILV.getOrCreateScalarValue(V, Instance); 7351 } 7352 7353 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { 7354 O << " +\n" 7355 << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7356 IG->getInsertPos()->printAsOperand(O, false); 7357 O << ", "; 7358 getAddr()->printAsOperand(O); 7359 VPValue *Mask = getMask(); 7360 if (Mask) { 7361 O << ", "; 7362 Mask->printAsOperand(O); 7363 } 7364 O << "\\l\""; 7365 for (unsigned i = 0; i < IG->getFactor(); ++i) 7366 if (Instruction *I = IG->getMember(i)) 7367 O << " +\n" 7368 << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\""; 7369 } 7370 7371 void VPWidenRecipe::execute(VPTransformState &State) { 7372 for (auto &Instr : make_range(Begin, End)) 7373 State.ILV->widenInstruction(Instr); 7374 } 7375 7376 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7377 State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, 7378 IsIndexLoopInvariant); 7379 } 7380 7381 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7382 assert(!State.Instance && "Int or FP induction being replicated."); 7383 State.ILV->widenIntOrFpInduction(IV, Trunc); 7384 } 7385 7386 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7387 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7388 } 7389 7390 void VPBlendRecipe::execute(VPTransformState &State) { 7391 State.ILV->setDebugLocFromInst(State.Builder, Phi); 7392 // We know that all PHIs in non-header blocks are converted into 7393 // selects, so we don't have to worry about the insertion order and we 7394 // can just use the builder. 7395 // At this point we generate the predication tree. There may be 7396 // duplications since this is a simple recursive scan, but future 7397 // optimizations will clean it up. 7398 7399 unsigned NumIncoming = Phi->getNumIncomingValues(); 7400 7401 assert((User || NumIncoming == 1) && 7402 "Multiple predecessors with predecessors having a full mask"); 7403 // Generate a sequence of selects of the form: 7404 // SELECT(Mask3, In3, 7405 // SELECT(Mask2, In2, 7406 // ( ...))) 7407 InnerLoopVectorizer::VectorParts Entry(State.UF); 7408 for (unsigned In = 0; In < NumIncoming; ++In) { 7409 for (unsigned Part = 0; Part < State.UF; ++Part) { 7410 // We might have single edge PHIs (blocks) - use an identity 7411 // 'select' for the first PHI operand. 7412 Value *In0 = 7413 State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part); 7414 if (In == 0) 7415 Entry[Part] = In0; // Initialize with the first incoming value. 7416 else { 7417 // Select between the current value and the previous incoming edge 7418 // based on the incoming mask. 
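        // The chain is built incrementally: Entry[Part] starts as the first
        // incoming value and every later edge wraps it in another select,
        // matching the nested-SELECT shape sketched above.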
        Value *Cond = State.get(User->getOperand(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
}

void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(),
                                      getMask());
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF > 1) {
      // If we're constructing lane 0, initialize to start from undef.
      if (State.Instance->Lane == 0) {
        Value *Undef =
            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF;
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  if (!User) // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();
  else {
    VPValue *BlockInMask = User->getOperand(0);
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  }

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose two destinations will be set later when they are created.
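  // Illustrative shape of the branch once the region's blocks exist (the
  // block names come from createReplicateRegion above):
  //   br i1 %cond, label %pred.<opcode>.if, label %pred.<opcode>.continue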
7479 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 7480 assert(isa<UnreachableInst>(CurrentTerminator) && 7481 "Expected to replace unreachable terminator with conditional branch."); 7482 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 7483 CondBr->setSuccessor(0, nullptr); 7484 ReplaceInstWithInst(CurrentTerminator, CondBr); 7485 } 7486 7487 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 7488 assert(State.Instance && "Predicated instruction PHI works per instance."); 7489 Instruction *ScalarPredInst = cast<Instruction>( 7490 State.ValueMap.getScalarValue(PredInst, *State.Instance)); 7491 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 7492 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 7493 assert(PredicatingBB && "Predicated block has no single predecessor."); 7494 7495 // By current pack/unpack logic we need to generate only a single phi node: if 7496 // a vector value for the predicated instruction exists at this point it means 7497 // the instruction has vector users only, and a phi for the vector value is 7498 // needed. In this case the recipe of the predicated instruction is marked to 7499 // also do that packing, thereby "hoisting" the insert-element sequence. 7500 // Otherwise, a phi node for the scalar value is needed. 7501 unsigned Part = State.Instance->Part; 7502 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 7503 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 7504 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 7505 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 7506 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 7507 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 7508 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 7509 } else { 7510 Type *PredInstType = PredInst->getType(); 7511 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 7512 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 7513 Phi->addIncoming(ScalarPredInst, PredicatedBB); 7514 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 7515 } 7516 } 7517 7518 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 7519 State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask()); 7520 } 7521 7522 // Determine how to lower the scalar epilogue, which depends on 1) optimising 7523 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 7524 // predication, and 4) a TTI hook that analyses whether the loop is suitable 7525 // for predication. 7526 static ScalarEpilogueLowering getScalarEpilogueLowering( 7527 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 7528 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 7529 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 7530 LoopVectorizationLegality &LVL) { 7531 bool OptSize = 7532 F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 7533 PGSOQueryType::IRPass); 7534 // 1) OptSize takes precedence over all other options, i.e. if this is set, 7535 // don't look at hints or options, and don't request a scalar epilogue. 
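  // Informal summary of the possible results, in the order checked below:
  //   CM_ScalarEpilogueNotAllowedOptSize     - optimizing for size;
  //   CM_ScalarEpilogueAllowed               - predication disabled or no
  //                                            primary induction (and default);
  //   CM_ScalarEpilogueNotNeededUsePredicate - predication requested or deemed
  //                                            profitable (fold the tail).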
  if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
    return CM_ScalarEpilogueNotAllowedOptSize;

  bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
                              !PreferPredicateOverEpilog;

  // 2) Next, if disabling predication is requested on the command line, honour
  // this and request a scalar epilogue. Also do this if we don't have a
  // primary induction variable, which is required for predication.
  if (PredicateOptDisabled || !LVL.getPrimaryInduction())
    return CM_ScalarEpilogueAllowed;

  // 3) and 4) Check if enabling predication is requested on the command line
  // or with a loop hint, or if the TTI hook indicates this is profitable; if
  // so, request predication.
  if (PreferPredicateOverEpilog ||
      Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
      (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
                                        LVL.getLAI()) &&
       Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {

  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);

  // Get user vectorization factor.
  const unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  LVP.setBestPlan(VF.Width, 1);

  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                         &CM);
  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                    << L->getHeader()->getParent()->getName() << "\"\n");
  LVP.executePlan(LB, DT);

  // Mark the loop as already vectorized to avoid vectorizing again.
7608 Hints.setAlreadyVectorized(); 7609 7610 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); 7611 return true; 7612 } 7613 7614 bool LoopVectorizePass::processLoop(Loop *L) { 7615 assert((EnableVPlanNativePath || L->empty()) && 7616 "VPlan-native path is not enabled. Only process inner loops."); 7617 7618 #ifndef NDEBUG 7619 const std::string DebugLocStr = getDebugLocString(L); 7620 #endif /* NDEBUG */ 7621 7622 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 7623 << L->getHeader()->getParent()->getName() << "\" from " 7624 << DebugLocStr << "\n"); 7625 7626 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 7627 7628 LLVM_DEBUG( 7629 dbgs() << "LV: Loop hints:" 7630 << " force=" 7631 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 7632 ? "disabled" 7633 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 7634 ? "enabled" 7635 : "?")) 7636 << " width=" << Hints.getWidth() 7637 << " unroll=" << Hints.getInterleave() << "\n"); 7638 7639 // Function containing loop 7640 Function *F = L->getHeader()->getParent(); 7641 7642 // Looking at the diagnostic output is the only way to determine if a loop 7643 // was vectorized (other than looking at the IR or machine code), so it 7644 // is important to generate an optimization remark for each loop. Most of 7645 // these messages are generated as OptimizationRemarkAnalysis. Remarks 7646 // generated as OptimizationRemark and OptimizationRemarkMissed are 7647 // less verbose reporting vectorized loops and unvectorized loops that may 7648 // benefit from vectorization, respectively. 7649 7650 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 7651 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 7652 return false; 7653 } 7654 7655 PredicatedScalarEvolution PSE(*SE, *L); 7656 7657 // Check if it is legal to vectorize the loop. 7658 LoopVectorizationRequirements Requirements(*ORE); 7659 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 7660 &Requirements, &Hints, DB, AC); 7661 if (!LVL.canVectorize(EnableVPlanNativePath)) { 7662 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 7663 Hints.emitRemarkWithHints(); 7664 return false; 7665 } 7666 7667 // Check the function attributes and profiles to find out if this function 7668 // should be optimized for size. 7669 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7670 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 7671 7672 // Entrance to the VPlan-native vectorization path. Outer loops are processed 7673 // here. They may require CFG and instruction level transformations before 7674 // even evaluating whether vectorization is profitable. Since we cannot modify 7675 // the incoming IR, we need to build VPlan upfront in the vectorization 7676 // pipeline. 7677 if (!L->empty()) 7678 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 7679 ORE, BFI, PSI, Hints); 7680 7681 assert(L->empty() && "Inner loop expected."); 7682 7683 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 7684 // count by optimizing for size, to minimize overheads. 7685 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 7686 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 7687 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. 
" 7688 << "This loop is worth vectorizing only if no scalar " 7689 << "iteration overheads are incurred."); 7690 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 7691 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 7692 else { 7693 LLVM_DEBUG(dbgs() << "\n"); 7694 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 7695 } 7696 } 7697 7698 // Check the function attributes to see if implicit floats are allowed. 7699 // FIXME: This check doesn't seem possibly correct -- what if the loop is 7700 // an integer loop and the vector instructions selected are purely integer 7701 // vector instructions? 7702 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 7703 reportVectorizationFailure( 7704 "Can't vectorize when the NoImplicitFloat attribute is used", 7705 "loop not vectorized due to NoImplicitFloat attribute", 7706 "NoImplicitFloat", ORE, L); 7707 Hints.emitRemarkWithHints(); 7708 return false; 7709 } 7710 7711 // Check if the target supports potentially unsafe FP vectorization. 7712 // FIXME: Add a check for the type of safety issue (denormal, signaling) 7713 // for the target we're vectorizing for, to make sure none of the 7714 // additional fp-math flags can help. 7715 if (Hints.isPotentiallyUnsafe() && 7716 TTI->isFPVectorizationPotentiallyUnsafe()) { 7717 reportVectorizationFailure( 7718 "Potentially unsafe FP op prevents vectorization", 7719 "loop not vectorized due to unsafe FP support.", 7720 "UnsafeFP", ORE, L); 7721 Hints.emitRemarkWithHints(); 7722 return false; 7723 } 7724 7725 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 7726 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 7727 7728 // If an override option has been passed in for interleaved accesses, use it. 7729 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 7730 UseInterleaved = EnableInterleavedMemAccesses; 7731 7732 // Analyze interleaved memory accesses. 7733 if (UseInterleaved) { 7734 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 7735 } 7736 7737 // Use the cost model. 7738 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 7739 F, &Hints, IAI); 7740 CM.collectValuesToIgnore(); 7741 7742 // Use the planner for vectorization. 7743 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI); 7744 7745 // Get user vectorization factor. 7746 unsigned UserVF = Hints.getWidth(); 7747 7748 // Plan how to best vectorize, return the best VF and its cost. 7749 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF); 7750 7751 VectorizationFactor VF = VectorizationFactor::Disabled(); 7752 unsigned IC = 1; 7753 unsigned UserIC = Hints.getInterleave(); 7754 7755 if (MaybeVF) { 7756 VF = *MaybeVF; 7757 // Select the interleave count. 7758 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 7759 } 7760 7761 // Identify the diagnostic messages that should be produced. 
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width == 1) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
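    // Both a vectorization remark and an interleaving remark are emitted here,
    // so remark-based tooling (e.g. -Rpass-missed=loop-vectorize) can report
    // the reason for each decision separately.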
7818 ORE->emit([&]() { 7819 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 7820 L->getStartLoc(), L->getHeader()) 7821 << VecDiagMsg.second; 7822 }); 7823 ORE->emit([&]() { 7824 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 7825 L->getStartLoc(), L->getHeader()) 7826 << IntDiagMsg.second; 7827 }); 7828 return false; 7829 } else if (!VectorizeLoop && InterleaveLoop) { 7830 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7831 ORE->emit([&]() { 7832 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 7833 L->getStartLoc(), L->getHeader()) 7834 << VecDiagMsg.second; 7835 }); 7836 } else if (VectorizeLoop && !InterleaveLoop) { 7837 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7838 << ") in " << DebugLocStr << '\n'); 7839 ORE->emit([&]() { 7840 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 7841 L->getStartLoc(), L->getHeader()) 7842 << IntDiagMsg.second; 7843 }); 7844 } else if (VectorizeLoop && InterleaveLoop) { 7845 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7846 << ") in " << DebugLocStr << '\n'); 7847 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7848 } 7849 7850 LVP.setBestPlan(VF.Width, IC); 7851 7852 using namespace ore; 7853 bool DisableRuntimeUnroll = false; 7854 MDNode *OrigLoopID = L->getLoopID(); 7855 7856 if (!VectorizeLoop) { 7857 assert(IC > 1 && "interleave count should not be 1 or 0"); 7858 // If we decided that it is not legal to vectorize the loop, then 7859 // interleave it. 7860 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 7861 &CM); 7862 LVP.executePlan(Unroller, DT); 7863 7864 ORE->emit([&]() { 7865 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 7866 L->getHeader()) 7867 << "interleaved loop (interleaved count: " 7868 << NV("InterleaveCount", IC) << ")"; 7869 }); 7870 } else { 7871 // If we decided that it is *legal* to vectorize the loop, then do it. 7872 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 7873 &LVL, &CM); 7874 LVP.executePlan(LB, DT); 7875 ++LoopsVectorized; 7876 7877 // Add metadata to disable runtime unrolling a scalar loop when there are 7878 // no runtime checks about strides and memory. A scalar loop that is 7879 // rarely used is not worth unrolling. 7880 if (!LB.areSafetyChecksAdded()) 7881 DisableRuntimeUnroll = true; 7882 7883 // Report the vectorization decision. 7884 ORE->emit([&]() { 7885 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 7886 L->getHeader()) 7887 << "vectorized loop (vectorization width: " 7888 << NV("VectorizationFactor", VF.Width) 7889 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 7890 }); 7891 } 7892 7893 Optional<MDNode *> RemainderLoopID = 7894 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7895 LLVMLoopVectorizeFollowupEpilogue}); 7896 if (RemainderLoopID.hasValue()) { 7897 L->setLoopID(RemainderLoopID.getValue()); 7898 } else { 7899 if (DisableRuntimeUnroll) 7900 AddRuntimeUnrollDisableMetaData(L); 7901 7902 // Mark the loop as already vectorized to avoid vectorizing again. 
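    // Note that this marking only happens when no followup metadata was
    // attached above; when a remainder-loop ID was created, the loop keeps
    // that ID instead.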
    Hints.setAlreadyVectorized();
  }

  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}

bool LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return false;

  bool Changed = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= processLoop(L);
  }

  // Process each loop nest in the function.
  return Changed;
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ?
&AM.getResult<MemorySSAAnalysis>(F).getMSSA() 7990 : nullptr; 7991 7992 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 7993 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 7994 [&](Loop &L) -> const LoopAccessInfo & { 7995 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; 7996 return LAM.getResult<LoopAccessAnalysis>(L, AR); 7997 }; 7998 const ModuleAnalysisManager &MAM = 7999 AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager(); 8000 ProfileSummaryInfo *PSI = 8001 MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 8002 bool Changed = 8003 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 8004 if (!Changed) 8005 return PreservedAnalyses::all(); 8006 PreservedAnalyses PA; 8007 8008 // We currently do not preserve loopinfo/dominator analyses with outer loop 8009 // vectorization. Until this is addressed, mark these analyses as preserved 8010 // only for non-VPlan-native path. 8011 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 8012 if (!EnableVPlanNativePath) { 8013 PA.preserve<LoopAnalysis>(); 8014 PA.preserve<DominatorTreeAnalysis>(); 8015 } 8016 PA.preserve<BasicAA>(); 8017 PA.preserve<GlobalsAA>(); 8018 return PA; 8019 } 8020