//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
//  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <memory>
#include <set>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;
using namespace llvm::PatternMatch;
using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                                  cl::desc("Run the SLP vectorization passes"));

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool>
    ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                       cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<int>
    MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
                           cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned>
    MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
                cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

static cl::opt<int>
    MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
                   cl::desc("Maximum depth of the lookup for consecutive stores."));

/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
    ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
                             cl::desc("Limit the size of the SLP scheduling region per block"));

static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {
  return isa<Constant>(V) && !isa<ConstantExpr>(V) && !isa<GlobalValue>(V);
}

/// Checks if \p V is one of the vector-like instructions, i.e. undef, an
/// insertelement/extractelement with constant indices for a fixed vector type,
/// or an extractvalue instruction.
static bool isVectorLikeInstWithConstOps(Value *V) {
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}

/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return false;
  if (all_of(VL, isVectorLikeInstWithConstOps))
    return true;

  BasicBlock *BB = I0->getParent();
  for (int I = 1, E = VL.size(); I < E; I++) {
    auto *II = dyn_cast<Instruction>(VL[I]);
    if (!II)
      return false;

    if (BB != II->getParent())
      return false;
  }
  return true;
}

/// \returns True if all of the values in \p VL are constants (but not
/// globals/constant expressions).
static bool allConstant(ArrayRef<Value *> VL) {
  // Constant expressions and globals can't be vectorized like normal
  // integer/FP constants.
  return all_of(VL, isConstant);
}

/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}
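
// A small illustrative sketch of the helpers above (hypothetical values):
//   isConstant(i32 42)                -> true
//   isConstant(@G or a ConstantExpr)  -> false
//   isSplat({%a, undef, %a})          -> true  (undefs are ignored)
//   isSplat({undef, undef})           -> false (no defined element at all)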

/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
static bool isCommutative(Instruction *I) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative();
  // TODO: This should check for generic Instruction::isCommutative(), but
  // we need to confirm that the caller code correctly handles Intrinsics
  // for example (does not have 2 operands).
  return false;
}

/// Checks if the given value is actually an undefined constant vector.
static bool isUndefVector(const Value *V) {
  if (isa<UndefValue>(V))
    return true;
  auto *C = dyn_cast<Constant>(V);
  if (!C)
    return false;
  if (!C->containsUndefOrPoisonElement())
    return false;
  auto *VecTy = dyn_cast<FixedVectorType>(C->getType());
  if (!VecTy)
    return false;
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      if (!isa<UndefValue>(Elem))
        return false;
  }
  return true;
}

/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %x0x0 = mul i8 %x0, %x0
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
/// ret <4 x i8> %ins4
/// can be transformed into:
/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
///                                                         i32 6>
/// %2 = mul <4 x i8> %1, %1
/// ret <4 x i8> %2
/// We convert this initially to something like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0
/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
/// %5 = mul <4 x i8> %4, %4
/// %6 = extractelement <4 x i8> %5, i32 0
/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0
/// %7 = extractelement <4 x i8> %5, i32 1
/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
/// %8 = extractelement <4 x i8> %5, i32 2
/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
/// %9 = extractelement <4 x i8> %5, i32 3
/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
/// ret <4 x i8> %ins4
/// InstCombiner transforms this into a shuffle and vector mul
/// Mask will return the Shuffle Mask equivalent to the extracted elements.
/// TODO: Can we split off and reuse the shuffle mask detection from
/// TargetTransformInfo::getInstructionThroughput?
static Optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
  const auto *It =
      find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); });
  if (It == VL.end())
    return None;
  auto *EI0 = cast<ExtractElementInst>(*It);
  if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
    return None;
  unsigned Size =
      cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(VL.size(), UndefMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return None;
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from undef or poison vector.
    if (isUndefVector(Vec))
      continue;
    // All vector operands must have the same number of vector elements.
    if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
      return None;
    if (isa<UndefValue>(EI->getIndexOperand()))
      continue;
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    if (!Idx)
      return None;
    // Undefined behavior if Idx is negative or >= Size.
    if (Idx->getValue().uge(Size))
      continue;
    unsigned IntIdx = Idx->getValue().getZExtValue();
    Mask[I] = IntIdx;
    // For correct shuffling we have to have at most 2 different vector operands
    // in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      Mask[I] += Size;
    } else {
      return None;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (IntIdx != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never used, we have a permutation of a single vector, otherwise
  // we have permutation of 2 vectors.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}

namespace {

/// Main data required for vectorization of instructions.
struct InstructionsState {
  /// The very first instruction in the list with the main opcode.
  Value *OpValue = nullptr;

  /// The main/alternate instruction.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const {
    return MainOp ? MainOp->getOpcode() : 0;
  }

  unsigned getAltOpcode() const {
    return AltOp ? AltOp->getOpcode() : 0;
  }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return AltOp != MainOp; }

  bool isOpcodeOrAlt(Instruction *I) const {
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  }

  InstructionsState() = delete;
  InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
};

} // end anonymous namespace

/// Chooses the correct key for scheduling data. If \p Op has the same (or
/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
/// OpValue.
static Value *isOneOf(const InstructionsState &S, Value *Op) {
  auto *I = dyn_cast<Instruction>(Op);
  if (I && S.isOpcodeOrAlt(I))
    return Op;
  return S.OpValue;
}

/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
///
/// An example of an unsupported opcode is SDIV, which can potentially cause UB
/// if a "shuffled out" lane would result in division by zero.
static bool isValidForAlternation(unsigned Opcode) {
  if (Instruction::isIntDivRem(Opcode))
    return false;

  return true;
}

static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       unsigned BaseIndex = 0);

/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
/// compatible instructions or constants, or just some other regular values.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
                                Value *Op1) {
  return (isConstant(BaseOp0) && isConstant(Op0)) ||
         (isConstant(BaseOp1) && isConstant(Op1)) ||
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         getSameOpcode({BaseOp0, Op0}).getOpcode() ||
         getSameOpcode({BaseOp1, Op1}).getOpcode();
}

/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState, i.e. the opcode with which we suppose the whole list
/// could be vectorized even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       unsigned BaseIndex) {
  // Make sure these are all Instructions.
  if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);

  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
  bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
  CmpInst::Predicate BasePred =
      IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
              : CmpInst::BAD_ICMP_PREDICATE;
  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = BaseIndex;

  // Check for one alternate opcode from another BinaryOperator.
  // TODO - generalize to support all operators (types, calls etc.).
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;
      if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
          isValidForAlternation(Opcode)) {
        AltOpcode = InstOpcode;
        AltIndex = Cnt;
        continue;
      }
    } else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
      Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
      Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          assert(isValidForAlternation(Opcode) &&
                 isValidForAlternation(InstOpcode) &&
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltOpcode = InstOpcode;
          AltIndex = Cnt;
          continue;
        }
      }
    } else if (IsCmpOp && isa<CmpInst>(VL[Cnt])) {
      auto *BaseInst = cast<Instruction>(VL[BaseIndex]);
      auto *Inst = cast<Instruction>(VL[Cnt]);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        Value *BaseOp0 = BaseInst->getOperand(0);
        Value *BaseOp1 = BaseInst->getOperand(1);
        Value *Op0 = Inst->getOperand(0);
        Value *Op1 = Inst->getOperand(1);
        CmpInst::Predicate CurrentPred =
            cast<CmpInst>(VL[Cnt])->getPredicate();
        CmpInst::Predicate SwappedCurrentPred =
            CmpInst::getSwappedPredicate(CurrentPred);
        // Check for compatible operands. If the corresponding operands are not
        // compatible - need to perform alternate vectorization.
        if (InstOpcode == Opcode) {
          if (BasePred == CurrentPred &&
              areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1))
            continue;
          if (BasePred == SwappedCurrentPred &&
              areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0))
            continue;
          if (E == 2 &&
              (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
            continue;
          auto *AltInst = cast<CmpInst>(VL[AltIndex]);
          CmpInst::Predicate AltPred = AltInst->getPredicate();
          Value *AltOp0 = AltInst->getOperand(0);
          Value *AltOp1 = AltInst->getOperand(1);
          // Check if operands are compatible with alternate operands.
          if (AltPred == CurrentPred &&
              areCompatibleCmpOps(AltOp0, AltOp1, Op0, Op1))
            continue;
          if (AltPred == SwappedCurrentPred &&
              areCompatibleCmpOps(AltOp0, AltOp1, Op1, Op0))
            continue;
        }
        if (BaseIndex == AltIndex && BasePred != CurrentPred) {
          assert(isValidForAlternation(Opcode) &&
                 isValidForAlternation(InstOpcode) &&
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltIndex = Cnt;
          continue;
        }
        auto *AltInst = cast<CmpInst>(VL[AltIndex]);
        CmpInst::Predicate AltPred = AltInst->getPredicate();
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          continue;
      }
    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
      continue;
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }

  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                           cast<Instruction>(VL[AltIndex]));
}
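
// Illustrative behavior of getSameOpcode() on small lists (hypothetical
// values, assuming all elements are BinaryOperators):
//   {add, add, add, add} -> MainOp = add, AltOp = add, isAltShuffle() == false
//   {add, sub, add, sub} -> MainOp = add, AltOp = sub, isAltShuffle() == true
//   {add, sub, mul, add} -> a third opcode shows up, so the returned state has
//                           MainOp == nullptr and getOpcode() == 0.
// Integer division/remainder never becomes the alternate opcode because
// isValidForAlternation() rejects it.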

/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL[0]->getType();
  for (int i = 1, e = VL.size(); i < e; i++)
    if (VL[i]->getType() != Ty)
      return false;

  return true;
}

/// \returns the index of the element extracted by the Extract{Value,Element}
/// instruction \p E, or None if the index is not a single constant.
static Optional<unsigned> getExtractIndex(Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return None;
    return CI->getZExtValue();
  }
  ExtractValueInst *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return None;
  return *EI->idx_begin();
}

/// \returns True if the in-tree use also needs an extract. This refers to a
/// possible scalar operand in a vectorized instruction.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                    TargetLibraryInfo *TLI) {
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
      if (hasVectorInstrinsicScalarOpd(ID, i))
        return (CI->getArgOperand(i) == Scalar);
    }
    LLVM_FALLTHROUGH;
  }
  default:
    return false;
  }
}

/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}

/// Shuffles \p Mask in accordance with the given \p SubMask.
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
  if (SubMask.empty())
    return;
  if (Mask.empty()) {
    Mask.append(SubMask.begin(), SubMask.end());
    return;
  }
  SmallVector<int> NewMask(SubMask.size(), UndefMaskElem);
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
        Mask[SubMask[I]] >= TermValue)
      continue;
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
}
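
// A quick illustrative sketch of addMask() composition with hypothetical
// masks (both of size 4, no undef elements involved):
//   Mask    = {3, 2, 1, 0}   // reverse the vector
//   SubMask = {1, 0, 3, 2}   // then swap adjacent pairs
//   result  = {2, 3, 0, 1}   // because NewMask[I] == Mask[SubMask[I]]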

/// \p Order may have elements assigned a special value (the size) which is out
/// of bounds. Such indices only appear at positions that correspond to undef
/// values (see canReuseExtract for details) and are used to keep undef values
/// from affecting the ordering of the operands.
/// The first loop below simply finds all unused indices and then the next loop
/// nest assigns these indices to the undef value positions.
/// In the example below, Order has two undef positions and they get assigned
/// the values 3 and 7 respectively:
///   before: 6 9 5 4 9 2 1 0
///   after:  6 3 5 4 7 2 1 0
static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order) {
  const unsigned Sz = Order.size();
  SmallBitVector UnusedIndices(Sz, /*t=*/true);
  SmallBitVector MaskedIndices(Sz);
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      UnusedIndices.reset(Order[I]);
    else
      MaskedIndices.set(I);
  }
  if (MaskedIndices.none())
    return;
  assert(UnusedIndices.count() == MaskedIndices.count() &&
         "Non-synced masked/available indices.");
  int Idx = UnusedIndices.find_first();
  int MIdx = MaskedIndices.find_first();
  while (MIdx >= 0) {
    assert(Idx >= 0 && "Indices must be synced.");
    Order[MIdx] = Idx;
    Idx = UnusedIndices.find_next(Idx);
    MIdx = MaskedIndices.find_next(MIdx);
  }
}

namespace llvm {

static void inversePermutation(ArrayRef<unsigned> Indices,
                               SmallVectorImpl<int> &Mask) {
  Mask.clear();
  const unsigned E = Indices.size();
  Mask.resize(E, UndefMaskElem);
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
}

/// \returns the insertion index of the InsertElement or InsertValue
/// instruction, using \p Offset as the base offset for the index.
static Optional<unsigned> getInsertIndex(Value *InsertInst,
                                         unsigned Offset = 0) {
  int Index = Offset;
  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
    if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
      auto *VT = cast<FixedVectorType>(IE->getType());
      if (CI->getValue().uge(VT->getNumElements()))
        return None;
      Index *= VT->getNumElements();
      Index += CI->getZExtValue();
      return Index;
    }
    return None;
  }

  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return None;
    }
    Index += I;
  }
  return Index;
}

/// Reorders the list of scalars in accordance with the given \p Mask.
static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
                           ArrayRef<int> Mask) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  SmallVector<Value *> Prev(Scalars.size(),
                            UndefValue::get(Scalars.front()->getType()));
  Prev.swap(Scalars);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != UndefMaskElem)
      Scalars[Mask[I]] = Prev[I];
}
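
// Illustrative sketch (hypothetical values): for the order {2, 0, 1},
// inversePermutation() produces the shuffle mask {1, 2, 0}; applying that mask
// with reorderScalars() to the scalars {a, b, c} yields {c, a, b}, i.e. the
// element the order puts first (old index 2) ends up in lane 0.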

/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction or it is an instruction
/// that does not read/write memory and all of its operands are either not
/// instructions, or are phi nodes, or are instructions from different blocks.
static bool areAllOperandsNonInsts(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  return !mayHaveNonDefUseDependency(*I) &&
         all_of(I->operands(), [I](Value *V) {
           auto *IO = dyn_cast<Instruction>(V);
           if (!IO)
             return true;
           return isa<PHINode>(IO) || IO->getParent() != I->getParent();
         });
}

/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction or it is an instruction
/// that does not read/write memory and all of its users are phi nodes or
/// instructions from different blocks.
static bool isUsedOutsideBlock(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  // Limits the number of uses to save compile time.
  constexpr int UsesLimit = 8;
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
         all_of(I->users(), [I](User *U) {
           auto *IU = dyn_cast<Instruction>(U);
           if (!IU)
             return true;
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
         });
}

/// Checks if the specified value does not require scheduling. It does not
/// require scheduling if all operands and all users do not need to be
/// scheduled in the current basic block.
static bool doesNotNeedToBeScheduled(Value *V) {
  return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
}

/// Checks if the specified array of instructions does not require scheduling.
/// This is the case if either all of the instructions have operands that do
/// not require scheduling, or all of their users do not require scheduling
/// because they are phis or live in other basic blocks.
static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
  return !VL.empty() &&
         (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
}

namespace slpvectorizer {

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
  struct TreeEntry;
  struct ScheduleData;

public:
  using ValueList = SmallVector<Value *, 8>;
  using InstrList = SmallVector<Instruction *, 16>;
  using ValueSet = SmallPtrSet<Value *, 16>;
  using StoreList = SmallVector<StoreInst *, 8>;
  using ExtraValueToDebugLocsMap =
      MapVector<Value *, SmallVector<Instruction *, 2>>;
  using OrdersType = SmallVector<unsigned, 4>;

  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li),
        DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
    CodeMetrics::collectEphemeralValues(F, AC, EphValues);
    // Use the vector register size specified by the target unless overridden
    // by a command-line option.
    // TODO: It would be better to limit the vectorization factor based on
    //       data type rather than just register size. For example, x86 AVX has
    //       256-bit registers, but it does not support integer operations
    //       at that width (that requires AVX2).
    if (MaxVectorRegSizeOption.getNumOccurrences())
      MaxVecRegSize = MaxVectorRegSizeOption;
    else
      MaxVecRegSize =
          TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedSize();

    if (MinVectorRegSizeOption.getNumOccurrences())
      MinVecRegSize = MinVectorRegSizeOption;
    else
      MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
  }

  /// Vectorize the tree that starts with the elements in \p VL.
  /// Returns the vectorized root.
  Value *vectorizeTree();

  /// Vectorize the tree but with the list of externally used values \p
  /// ExternallyUsedValues. Values in this MapVector can be replaced by the
  /// generated extractvalue instructions.
  Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);

  /// \returns the cost incurred by unwanted spills and fills, caused by
  /// holding live values over call sites.
  InstructionCost getSpillCost() const;

  /// \returns the vectorization cost of the subtree that starts at \p VL.
  /// A negative number means that this is profitable.
  InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = None);

  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
  /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
  void buildTree(ArrayRef<Value *> Roots,
                 ArrayRef<Value *> UserIgnoreLst = None);

  /// Builds external uses of the vectorized scalars, i.e. the list of
  /// vectorized scalars to be extracted, their lanes and their scalar users.
  /// \p ExternallyUsedValues contains an additional list of external uses to
  /// handle vectorization of reductions.
  void
  buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});

  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MustGather.clear();
    ExternalUses.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    MinBWs.clear();
    InstrElementSize.clear();
  }

  unsigned getTreeSize() const { return VectorizableTree.size(); }

  /// Perform LICM and CSE on the newly generated gather sequences.
  void optimizeGatherSequence();

  /// Checks if the specified gather tree entry \p TE can be represented as a
  /// shuffled vector entry + (possibly) a permutation with other gathers. It
  /// implements the checks only for possibly ordered scalars (Loads,
  /// ExtractElement, ExtractValue), which can be part of the graph.
  Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);

  /// Gets reordering data for the given tree entry. If the entry is vectorized
  /// - just return ReorderIndices, otherwise check if the scalars can be
  /// reordered and return the most optimal order.
  /// \param TopToBottom If true, include the order of vectorized stores and
  /// insertelement nodes, otherwise skip them.
  Optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom);

  /// Reorders the current graph to the most profitable order starting from the
  /// root node to the leaf nodes. The best order is chosen only from the nodes
  /// of the same size (vectorization factor). Smaller nodes are considered
  /// parts of a subgraph with a smaller VF and they are reordered independently.
  /// We can do this because we still need to extend smaller nodes to the wider
  /// VF and we can merge the reordering shuffles with the widening shuffles.
  void reorderTopToBottom();

  /// Reorders the current graph to the most profitable order starting from
  /// the leaves to the root. It allows rotating small subgraphs and reduces
  /// the number of reshuffles if the leaf nodes use the same order. In this
  /// case we can merge the orders and just shuffle the user node instead of
  /// shuffling its operands. Plus, even if the leaf nodes have different
  /// orders, it allows sinking the reordering in the graph closer to the root
  /// node and merging it later during analysis.
  void reorderBottomToTop(bool IgnoreReorder = false);

  /// \return The vector element size in bits to use when vectorizing the
  /// expression tree ending at \p V. If V is a store, the size is the width of
  /// the stored value. Otherwise, the size is the width of the largest loaded
  /// value reaching V. This method is used by the vectorizer to calculate
  /// vectorization factors.
  unsigned getVectorElementSize(Value *V);

  /// Compute the minimum type sizes required to represent the entries in a
  /// vectorizable tree.
  void computeMinimumValueSizes();

  // \returns maximum vector register size as set by TTI or overridden by
  // cl::opt.
  unsigned getMaxVecRegSize() const {
    return MaxVecRegSize;
  }

  // \returns minimum vector register size as set by cl::opt.
  unsigned getMinVecRegSize() const {
    return MinVecRegSize;
  }

  unsigned getMinVF(unsigned Sz) const {
    return std::max(2U, getMinVecRegSize() / Sz);
  }

  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
    unsigned MaxVF = MaxVFOption.getNumOccurrences()
                         ? MaxVFOption
                         : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
  }

  /// Check if homogeneous aggregate is isomorphic to some VectorType.
  /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
  /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
  /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
  ///
  /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
  unsigned canMapToVector(Type *T, const DataLayout &DL) const;

  /// \returns True if the VectorizableTree is both tiny and not fully
  /// vectorizable. We do not vectorize such trees.
  bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;

  /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
  /// can be load combined in the backend. Load combining may not be allowed in
  /// the IR optimizer, so we do not want to alter the pattern. For example,
  /// partially transforming a scalar bswap() pattern into vector code is
  /// effectively impossible for the backend to undo.
  /// TODO: If load combining is allowed in the IR optimizer, this analysis
  ///       may not be necessary.
  bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;

  /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
  /// can be load combined in the backend. Load combining may not be allowed in
  /// the IR optimizer, so we do not want to alter the pattern. For example,
  /// partially transforming a scalar bswap() pattern into vector code is
  /// effectively impossible for the backend to undo.
  /// TODO: If load combining is allowed in the IR optimizer, this analysis
  ///       may not be necessary.
  bool isLoadCombineCandidate() const;

  OptimizationRemarkEmitter *getORE() { return ORE; }

  /// This structure holds any data we need about the edges being traversed
  /// during buildTree_rec(). We keep track of:
  /// (i) the user TreeEntry index, and
  /// (ii) the index of the edge.
  struct EdgeInfo {
    EdgeInfo() = default;
    EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
        : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
    /// The user TreeEntry.
    TreeEntry *UserTE = nullptr;
    /// The operand index of the use.
    unsigned EdgeIdx = UINT_MAX;
#ifndef NDEBUG
    friend inline raw_ostream &operator<<(raw_ostream &OS,
                                          const BoUpSLP::EdgeInfo &EI) {
      EI.dump(OS);
      return OS;
    }
    /// Debug print.
    void dump(raw_ostream &OS) const {
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
    }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
  };

  /// A helper data structure to hold the operands of a vector of instructions.
  /// This supports a fixed vector length for all operand vectors.
  class VLOperands {
    /// For each operand we need (i) the value, and (ii) the opcode that it
    /// would be attached to if the expression was in a left-linearized form.
    /// This is required to avoid illegal operand reordering.
    /// For example:
    /// \verbatim
    ///               0 Op1
    ///               |/
    /// Op1 Op2   Linearized   + Op2
    ///   \ /     ---------->  |/
    ///    -                   -
    ///
    /// Op1 - Op2            (0 + Op1) - Op2
    /// \endverbatim
    ///
    /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
    ///
    /// Another way to think of this is to track all the operations across the
    /// path from the operand all the way to the root of the tree and to
    /// calculate the operation that corresponds to this path. For example, the
    /// path from Op2 to the root crosses the RHS of the '-', therefore the
    /// corresponding operation is a '-' (which matches the one in the
    /// linearized tree, as shown above).
    ///
    /// For lack of a better term, we refer to this operation as Accumulated
    /// Path Operation (APO).
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      /// The operand value.
      Value *V = nullptr;
      /// TreeEntries only allow a single opcode, or an alternate sequence of
      /// them (e.g, +, -). Therefore, we can safely use a boolean value for
      /// the APO. It is set to 'true' if 'V' is attached to an inverse
      /// operation in the left-linearized form (e.g., Sub/Div), and 'false'
      /// otherwise (e.g., Add/Mul).
      bool APO = false;
      /// Helper data for the reordering function.
      bool IsUsed = false;
    };
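
    // An illustrative APO sketch (hypothetical scalar expression): for
    // X = (A - B) - C, the left-linearized form is ((0 + A) - B) - C, so the
    // OperandData for A carries APO == false (it is attached to the '+'),
    // while B and C carry APO == true (both are attached to a '-').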

    /// During operand reordering, we are trying to select the operand at a
    /// lane that matches best with the operand at the neighboring lane. Our
    /// selection is based on the type of value we are looking for. For
    /// example, if the neighboring lane has a load, we need to look for a load
    /// that is accessing a consecutive address. These strategies are
    /// summarized in the 'ReorderingMode' enumerator.
    enum class ReorderingMode {
      Load,     ///< Matching loads to consecutive memory addresses
      Opcode,   ///< Matching instructions based on opcode (same or alternate)
      Constant, ///< Matching constants
      Splat,    ///< Matching the same instruction multiple times (broadcast)
      Failed,   ///< We failed to create a vectorizable group
    };

    using OperandDataVec = SmallVector<OperandData, 2>;

    /// A vector of operand vectors.
    SmallVector<OperandDataVec, 4> OpsVec;

    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;

    /// \returns the operand data at \p OpIdx and \p Lane.
    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];
    }

    /// \returns the operand data at \p OpIdx and \p Lane. Const version.
    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];
    }

    /// Clears the used flag for all entries.
    void clearUsed() {
      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
           OpIdx != NumOperands; ++OpIdx)
        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
             ++Lane)
          OpsVec[OpIdx][Lane].IsUsed = false;
    }

    /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    }

    // The hard-coded scores listed here are not very important, though they
    // should be higher for better matches to improve the resulting cost. When
    // computing the scores of matching one sub-tree with another, we are
    // basically counting the number of values that are matching. So even if
    // all scores are set to 1, we would still get a decent matching result.
    // However, sometimes we have to break ties. For example we may have to
    // choose between matching loads vs matching opcodes. This is what these
    // scores are helping us with: they provide the order of preference. Also,
    // this is important if the scalar is externally used or used in another
    // tree entry node in a different lane.

    /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
    static const int ScoreConsecutiveLoads = 4;
    /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
    static const int ScoreReversedLoads = 3;
    /// ExtractElementInst from same vector and consecutive indexes.
    static const int ScoreConsecutiveExtracts = 4;
    /// ExtractElementInst from same vector and reversed indices.
    static const int ScoreReversedExtracts = 3;
    /// Constants.
    static const int ScoreConstants = 2;
    /// Instructions with the same opcode.
    static const int ScoreSameOpcode = 2;
    /// Instructions with alt opcodes (e.g, add + sub).
    static const int ScoreAltOpcodes = 1;
    /// Identical instructions (a.k.a. splat or broadcast).
    static const int ScoreSplat = 1;
    /// Matching with an undef is preferable to failing.
    static const int ScoreUndef = 1;
    /// Score for failing to find a decent match.
    static const int ScoreFail = 0;
    /// Score if all users are vectorized.
    static const int ScoreAllUserVectorized = 1;
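
    // For intuition, a few hypothetical candidate pairs ranked by the
    // constants above: load(A[i]) with load(A[i+1]) scores 4 (consecutive
    // loads), two constants or two adds score 2, an add paired with a sub
    // scores 1 (alternate opcodes), and a pair with nothing in common scores
    // 0 (ScoreFail).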

    /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
    /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
    /// MainAltOps.
    static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
                               ScalarEvolution &SE, int NumLanes,
                               ArrayRef<Value *> MainAltOps) {
      if (V1 == V2)
        return VLOperands::ScoreSplat;

      auto *LI1 = dyn_cast<LoadInst>(V1);
      auto *LI2 = dyn_cast<LoadInst>(V2);
      if (LI1 && LI2) {
        if (LI1->getParent() != LI2->getParent())
          return VLOperands::ScoreFail;

        Optional<int> Dist = getPointersDiff(
            LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
            LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
        if (!Dist || *Dist == 0)
          return VLOperands::ScoreFail;
        // The distance is too large - still may be profitable to use masked
        // loads/gathers.
        if (std::abs(*Dist) > NumLanes / 2)
          return VLOperands::ScoreAltOpcodes;
        // This will still detect consecutive loads, but we might have "holes"
        // in some cases. It is ok for non-power-2 vectorization and may
        // produce better results. It should not affect current vectorization.
        return (*Dist > 0) ? VLOperands::ScoreConsecutiveLoads
                           : VLOperands::ScoreReversedLoads;
      }

      auto *C1 = dyn_cast<Constant>(V1);
      auto *C2 = dyn_cast<Constant>(V2);
      if (C1 && C2)
        return VLOperands::ScoreConstants;

      // Extracts from consecutive indexes of the same vector get a better
      // score as the extracts could be optimized away.
      Value *EV1;
      ConstantInt *Ex1Idx;
      if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
        // Undefs are always profitable for extractelements.
        if (isa<UndefValue>(V2))
          return VLOperands::ScoreConsecutiveExtracts;
        Value *EV2 = nullptr;
        ConstantInt *Ex2Idx = nullptr;
        if (match(V2,
                  m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
                                                         m_Undef())))) {
          // Undefs are always profitable for extractelements.
          if (!Ex2Idx)
            return VLOperands::ScoreConsecutiveExtracts;
          if (isUndefVector(EV2) && EV2->getType() == EV1->getType())
            return VLOperands::ScoreConsecutiveExtracts;
          if (EV2 == EV1) {
            int Idx1 = Ex1Idx->getZExtValue();
            int Idx2 = Ex2Idx->getZExtValue();
            int Dist = Idx2 - Idx1;
            // The distance is too large - still may be profitable to use
            // shuffles.
            if (std::abs(Dist) == 0)
              return VLOperands::ScoreSplat;
            if (std::abs(Dist) > NumLanes / 2)
              return VLOperands::ScoreSameOpcode;
            return (Dist > 0) ? VLOperands::ScoreConsecutiveExtracts
                              : VLOperands::ScoreReversedExtracts;
          }
          return VLOperands::ScoreAltOpcodes;
        }
        return VLOperands::ScoreFail;
      }

      auto *I1 = dyn_cast<Instruction>(V1);
      auto *I2 = dyn_cast<Instruction>(V2);
      if (I1 && I2) {
        if (I1->getParent() != I2->getParent())
          return VLOperands::ScoreFail;
        SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
        Ops.push_back(I1);
        Ops.push_back(I2);
        InstructionsState S = getSameOpcode(Ops);
        // Note: Only consider instructions with <= 2 operands to avoid
        // complexity explosion.
        if (S.getOpcode() &&
            (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
             !S.isAltShuffle()) &&
            all_of(Ops, [&S](Value *V) {
              return cast<Instruction>(V)->getNumOperands() ==
                     S.MainOp->getNumOperands();
            }))
          return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
                                  : VLOperands::ScoreSameOpcode;
      }

      if (isa<UndefValue>(V2))
        return VLOperands::ScoreUndef;

      return VLOperands::ScoreFail;
    }

    /// \param Lane lane of the operands under analysis.
    /// \param OpIdx operand index in lane \p Lane for which we're looking for
    /// the best candidate.
    /// \param Idx operand index of the current candidate value.
    /// \returns The additional score due to possible broadcasting of the
    /// elements in the lane. It is more profitable to have power-of-2 unique
    /// elements in the lane; it will be vectorized with a higher probability
    /// after removing duplicates. Currently the SLP vectorizer supports only
    /// vectorization of the power-of-2 number of unique scalars.
    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
        return 0;
      SmallPtrSet<Value *, 4> Uniques;
      for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
        if (Ln == Lane)
          continue;
        Value *OpIdxLnV = getData(OpIdx, Ln).V;
        if (!isa<Instruction>(OpIdxLnV))
          return 0;
        Uniques.insert(OpIdxLnV);
      }
      int UniquesCount = Uniques.size();
      int UniquesCntWithIdxLaneV =
          Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      int UniquesCntWithOpIdxLaneV =
          Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
        return 0;
      return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
              UniquesCntWithOpIdxLaneV) -
             (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
    }

    /// \param Lane lane of the operands under analysis.
    /// \param OpIdx operand index in lane \p Lane for which we're looking for
    /// the best candidate.
    /// \param Idx operand index of the current candidate value.
    /// \returns The additional score for the scalar whose users are all
    /// vectorized.
    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      // Do not care about number of uses for vector-like instructions
      // (extractelement/extractvalue with constant indices), they are extracts
      // themselves and already externally used. Vectorization of such
      // instructions does not add extra extractelement instruction, just may
      // remove it.
      if (isVectorLikeInstWithConstOps(IdxLaneV) &&
          isVectorLikeInstWithConstOps(OpIdxLaneV))
        return VLOperands::ScoreAllUserVectorized;
      auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
      if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
        return 0;
      return R.areAllUsersVectorized(IdxLaneI, None)
                 ? VLOperands::ScoreAllUserVectorized
                 : 0;
    }

    /// Go through the operands of \p LHS and \p RHS recursively until \p
    /// MaxLevel, and return the cumulative score. For example:
    /// \verbatim
    ///  A[0]  B[0]  A[1]  B[1]  C[0] D[0]  B[1] A[1]
    ///     \ /         \ /         \ /        \ /
    ///      +           +           +          +
    ///     G1          G2          G3         G4
    /// \endverbatim
    /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
    /// each level recursively, accumulating the score. It starts from matching
    /// the additions at level 0, then moves on to the loads (level 1). The
    /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
    /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
    /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
    /// Please note that the order of the operands does not matter, as we
    /// evaluate the score of all profitable combinations of operands. In
    /// other words the score of G1 and G4 is the same as G1 and G2. This
    /// heuristic is based on ideas described in:
    ///   Look-ahead SLP: Auto-vectorization in the presence of commutative
    ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
    ///   Luís F. W. Góes
    int getScoreAtLevelRec(Value *LHS, Value *RHS, int CurrLevel, int MaxLevel,
                           ArrayRef<Value *> MainAltOps) {

      // Get the shallow score of LHS and RHS.
      int ShallowScoreAtThisLevel =
          getShallowScore(LHS, RHS, DL, SE, getNumLanes(), MainAltOps);

      // If reached MaxLevel,
      // or if LHS and RHS are not instructions,
      // or if they are SPLAT,
      // or if they are not consecutive,
      // or if it is profitable to vectorize loads or extractelements, early
      // return the current cost.
      auto *I1 = dyn_cast<Instruction>(LHS);
      auto *I2 = dyn_cast<Instruction>(RHS);
      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
          (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
            (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
            (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
           ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
      assert(I1 && I2 && "Should have early exited.");

      // Contains the I2 operand indexes that got matched with I1 operands.
      SmallSet<unsigned, 4> Op2Used;

      // Recursion towards the operands of I1 and I2. We are trying all
      // possible operand pairs, and keeping track of the best score.
      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
           OpIdx1 != NumOperands1; ++OpIdx1) {
        // Try to pair the operand at OpIdx1 with the best operand of I2.
        int MaxTmpScore = 0;
        unsigned MaxOpIdx2 = 0;
        bool FoundBest = false;
        // If I2 is commutative try all combinations.
        unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
        unsigned ToIdx = isCommutative(I2)
                             ? I2->getNumOperands()
                             : std::min(I2->getNumOperands(), OpIdx1 + 1);
        assert(FromIdx <= ToIdx && "Bad index");
        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
          // Skip operands already paired with OpIdx1.
          if (Op2Used.count(OpIdx2))
            continue;
          // Recursively calculate the cost at each level.
          int TmpScore =
              getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
                                 CurrLevel + 1, MaxLevel, None);
          // Look for the best score.
          if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
            MaxTmpScore = TmpScore;
            MaxOpIdx2 = OpIdx2;
            FoundBest = true;
          }
        }
        if (FoundBest) {
          // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
          Op2Used.insert(MaxOpIdx2);
          ShallowScoreAtThisLevel += MaxTmpScore;
        }
      }
      return ShallowScoreAtThisLevel;
    }
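
    // Hypothetical continuation of the A/B/C/D example above (assuming the
    // loads of A and B are consecutive and C/D come from unrelated pointers):
    // matching G1 with G2 scores roughly ScoreSameOpcode (2) for the adds plus
    // ScoreConsecutiveLoads (4) for each of {A[0],A[1]} and {B[0],B[1]}, about
    // 10 in total, while matching G1 with G3 bottoms out near 2 because the
    // load pairs fail the pointer-difference check. The numbers are only meant
    // to illustrate how the per-level scores accumulate.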
1409 static const int ScoreScaleFactor = 10; 1410 1411 /// \returns the look-ahead score, which tells us how much the sub-trees 1412 /// rooted at \p LHS and \p RHS match; the more they match, the higher the 1413 /// score. This helps break ties in an informed way when we cannot decide on 1414 /// the order of the operands by just considering the immediate 1415 /// predecessors. 1416 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps, 1417 int Lane, unsigned OpIdx, unsigned Idx, 1418 bool &IsUsed) { 1419 int Score = 1420 getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth, MainAltOps); 1421 if (Score) { 1422 int SplatScore = getSplatScore(Lane, OpIdx, Idx); 1423 if (Score <= -SplatScore) { 1424 // Set the minimum score for a splat-like sequence to avoid setting 1425 // the failed state. 1426 Score = 1; 1427 } else { 1428 Score += SplatScore; 1429 // Scale the score to keep the difference between distinct operands 1430 // larger than the bonus given to similar operands whose uses are all 1431 // vectorized. It does not affect the actual selection of the best 1432 // compatible operand in general, it just allows selecting the 1433 // operand with all of its uses vectorized. 1434 Score *= ScoreScaleFactor; 1435 Score += getExternalUseScore(Lane, OpIdx, Idx); 1436 IsUsed = true; 1437 } 1438 } 1439 return Score; 1440 } 1441 1442 /// Best defined scores per lane between the passes. Used to choose the 1443 /// best operand (with the highest score) between the passes. 1444 /// The key - {Operand Index, Lane}. 1445 /// The value - the best score between the passes for the lane and the 1446 /// operand. 1447 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8> 1448 BestScoresPerLanes; 1449 1450 // Search all operands in Ops[*][Lane] for the one that best matches 1451 // Ops[OpIdx][LastLane] and return its operand index. 1452 // If no good match can be found, return None. 1453 Optional<unsigned> getBestOperand(unsigned OpIdx, int Lane, int LastLane, 1454 ArrayRef<ReorderingMode> ReorderingModes, 1455 ArrayRef<Value *> MainAltOps) { 1456 unsigned NumOperands = getNumOperands(); 1457 1458 // The operand of the previous lane at OpIdx. 1459 Value *OpLastLane = getData(OpIdx, LastLane).V; 1460 1461 // Our strategy mode for OpIdx. 1462 ReorderingMode RMode = ReorderingModes[OpIdx]; 1463 if (RMode == ReorderingMode::Failed) 1464 return None; 1465 1466 // The linearized opcode of the operand at OpIdx, Lane. 1467 bool OpIdxAPO = getData(OpIdx, Lane).APO; 1468 1469 // The best operand index and its score. 1470 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we 1471 // are using the score to differentiate between the two. 1472 struct BestOpData { 1473 Optional<unsigned> Idx = None; 1474 unsigned Score = 0; 1475 } BestOp; 1476 BestOp.Score = 1477 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0) 1478 .first->second; 1479 1480 // Track if the operand must be marked as used. If the operand is set to 1481 // Score 1 explicitly (because of a non-power-of-2 number of unique scalars), 1482 // we may want to reestimate the operands again on the following iterations. 1483 bool IsUsed = 1484 RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant; 1485 // Iterate through all unused operands and look for the best. 1486 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) { 1487 // Get the operand at Idx and Lane. 1488 OperandData &OpData = getData(Idx, Lane); 1489 Value *Op = OpData.V; 1490 bool OpAPO = OpData.APO; 1491 1492 // Skip already selected operands.
1493 if (OpData.IsUsed) 1494 continue; 1495 1496 // Skip if we are trying to move the operand to a position with a 1497 // different opcode in the linearized tree form. This would break the 1498 // semantics. 1499 if (OpAPO != OpIdxAPO) 1500 continue; 1501 1502 // Look for an operand that matches the current mode. 1503 switch (RMode) { 1504 case ReorderingMode::Load: 1505 case ReorderingMode::Constant: 1506 case ReorderingMode::Opcode: { 1507 bool LeftToRight = Lane > LastLane; 1508 Value *OpLeft = (LeftToRight) ? OpLastLane : Op; 1509 Value *OpRight = (LeftToRight) ? Op : OpLastLane; 1510 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane, 1511 OpIdx, Idx, IsUsed); 1512 if (Score > static_cast<int>(BestOp.Score)) { 1513 BestOp.Idx = Idx; 1514 BestOp.Score = Score; 1515 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score; 1516 } 1517 break; 1518 } 1519 case ReorderingMode::Splat: 1520 if (Op == OpLastLane) 1521 BestOp.Idx = Idx; 1522 break; 1523 case ReorderingMode::Failed: 1524 llvm_unreachable("Not expected Failed reordering mode."); 1525 } 1526 } 1527 1528 if (BestOp.Idx) { 1529 getData(BestOp.Idx.getValue(), Lane).IsUsed = IsUsed; 1530 return BestOp.Idx; 1531 } 1532 // If we could not find a good match return None. 1533 return None; 1534 } 1535 1536 /// Helper for reorderOperandVecs. 1537 /// \returns the lane that we should start reordering from. This is the one 1538 /// which has the least number of operands that can freely move about or 1539 /// less profitable because it already has the most optimal set of operands. 1540 unsigned getBestLaneToStartReordering() const { 1541 unsigned Min = UINT_MAX; 1542 unsigned SameOpNumber = 0; 1543 // std::pair<unsigned, unsigned> is used to implement a simple voting 1544 // algorithm and choose the lane with the least number of operands that 1545 // can freely move about or less profitable because it already has the 1546 // most optimal set of operands. The first unsigned is a counter for 1547 // voting, the second unsigned is the counter of lanes with instructions 1548 // with same/alternate opcodes and same parent basic block. 1549 MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap; 1550 // Try to be closer to the original results, if we have multiple lanes 1551 // with same cost. If 2 lanes have the same cost, use the one with the 1552 // lowest index. 1553 for (int I = getNumLanes(); I > 0; --I) { 1554 unsigned Lane = I - 1; 1555 OperandsOrderData NumFreeOpsHash = 1556 getMaxNumOperandsThatCanBeReordered(Lane); 1557 // Compare the number of operands that can move and choose the one with 1558 // the least number. 1559 if (NumFreeOpsHash.NumOfAPOs < Min) { 1560 Min = NumFreeOpsHash.NumOfAPOs; 1561 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent; 1562 HashMap.clear(); 1563 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); 1564 } else if (NumFreeOpsHash.NumOfAPOs == Min && 1565 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) { 1566 // Select the most optimal lane in terms of number of operands that 1567 // should be moved around. 
1568 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent; 1569 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); 1570 } else if (NumFreeOpsHash.NumOfAPOs == Min && 1571 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) { 1572 auto It = HashMap.find(NumFreeOpsHash.Hash); 1573 if (It == HashMap.end()) 1574 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); 1575 else 1576 ++It->second.first; 1577 } 1578 } 1579 // Select the lane with the minimum counter. 1580 unsigned BestLane = 0; 1581 unsigned CntMin = UINT_MAX; 1582 for (const auto &Data : reverse(HashMap)) { 1583 if (Data.second.first < CntMin) { 1584 CntMin = Data.second.first; 1585 BestLane = Data.second.second; 1586 } 1587 } 1588 return BestLane; 1589 } 1590 1591 /// Data structure that helps to reorder operands. 1592 struct OperandsOrderData { 1593 /// The best number of operands with the same APOs, which can be 1594 /// reordered. 1595 unsigned NumOfAPOs = UINT_MAX; 1596 /// Number of operands with the same/alternate instruction opcode and 1597 /// parent. 1598 unsigned NumOpsWithSameOpcodeParent = 0; 1599 /// Hash for the actual operands ordering. 1600 /// Used to count operands, actually their position id and opcode 1601 /// value. It is used in the voting mechanism to find the lane with the 1602 /// least number of operands that can freely move about or less profitable 1603 /// because it already has the most optimal set of operands. Can be 1604 /// replaced with SmallVector<unsigned> instead but hash code is faster 1605 /// and requires less memory. 1606 unsigned Hash = 0; 1607 }; 1608 /// \returns the maximum number of operands that are allowed to be reordered 1609 /// for \p Lane and the number of compatible instructions(with the same 1610 /// parent/opcode). This is used as a heuristic for selecting the first lane 1611 /// to start operand reordering. 1612 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const { 1613 unsigned CntTrue = 0; 1614 unsigned NumOperands = getNumOperands(); 1615 // Operands with the same APO can be reordered. We therefore need to count 1616 // how many of them we have for each APO, like this: Cnt[APO] = x. 1617 // Since we only have two APOs, namely true and false, we can avoid using 1618 // a map. Instead we can simply count the number of operands that 1619 // correspond to one of them (in this case the 'true' APO), and calculate 1620 // the other by subtracting it from the total number of operands. 1621 // Operands with the same instruction opcode and parent are more 1622 // profitable since we don't need to move them in many cases, with a high 1623 // probability such lane already can be vectorized effectively. 1624 bool AllUndefs = true; 1625 unsigned NumOpsWithSameOpcodeParent = 0; 1626 Instruction *OpcodeI = nullptr; 1627 BasicBlock *Parent = nullptr; 1628 unsigned Hash = 0; 1629 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { 1630 const OperandData &OpData = getData(OpIdx, Lane); 1631 if (OpData.APO) 1632 ++CntTrue; 1633 // Use Boyer-Moore majority voting for finding the majority opcode and 1634 // the number of times it occurs. 
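// For example (illustrative), if the operands of this lane have compatible opcodes A, A, B, A, where B is not compatible with A, the counter evolves 1, 2, 1, 2 and A survives as the majority candidate, so no per-opcode map is needed.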
1635 if (auto *I = dyn_cast<Instruction>(OpData.V)) { 1636 if (!OpcodeI || !getSameOpcode({OpcodeI, I}).getOpcode() || 1637 I->getParent() != Parent) { 1638 if (NumOpsWithSameOpcodeParent == 0) { 1639 NumOpsWithSameOpcodeParent = 1; 1640 OpcodeI = I; 1641 Parent = I->getParent(); 1642 } else { 1643 --NumOpsWithSameOpcodeParent; 1644 } 1645 } else { 1646 ++NumOpsWithSameOpcodeParent; 1647 } 1648 } 1649 Hash = hash_combine( 1650 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1))); 1651 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V); 1652 } 1653 if (AllUndefs) 1654 return {}; 1655 OperandsOrderData Data; 1656 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue); 1657 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent; 1658 Data.Hash = Hash; 1659 return Data; 1660 } 1661 1662 /// Go through the instructions in VL and append their operands. 1663 void appendOperandsOfVL(ArrayRef<Value *> VL) { 1664 assert(!VL.empty() && "Bad VL"); 1665 assert((empty() || VL.size() == getNumLanes()) && 1666 "Expected same number of lanes"); 1667 assert(isa<Instruction>(VL[0]) && "Expected instruction"); 1668 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands(); 1669 OpsVec.resize(NumOperands); 1670 unsigned NumLanes = VL.size(); 1671 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { 1672 OpsVec[OpIdx].resize(NumLanes); 1673 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 1674 assert(isa<Instruction>(VL[Lane]) && "Expected instruction"); 1675 // Our tree has just 3 nodes: the root and two operands. 1676 // It is therefore trivial to get the APO. We only need to check the 1677 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or 1678 // RHS operand. The LHS operand of both add and sub is never attached 1679 // to an inverse operation in the linearized form, therefore its APO 1680 // is false. The RHS's APO is true only if VL[Lane] is an inverse operation. 1681 1682 // Since operand reordering is performed on groups of commutative 1683 // operations or alternating sequences (e.g., +, -), we can safely 1684 // tell the inverse operations by checking commutativity. 1685 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane])); 1686 bool APO = (OpIdx == 0) ? false : IsInverseOperation; 1687 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx), 1688 APO, false}; 1689 } 1690 } 1691 } 1692 1693 /// \returns the number of operands. 1694 unsigned getNumOperands() const { return OpsVec.size(); } 1695 1696 /// \returns the number of lanes. 1697 unsigned getNumLanes() const { return OpsVec[0].size(); } 1698 1699 /// \returns the operand value at \p OpIdx and \p Lane. 1700 Value *getValue(unsigned OpIdx, unsigned Lane) const { 1701 return getData(OpIdx, Lane).V; 1702 } 1703 1704 /// \returns true if the data structure is empty. 1705 bool empty() const { return OpsVec.empty(); } 1706 1707 /// Clears the data. 1708 void clear() { OpsVec.clear(); } 1709 1710 /// \returns true if there are enough operands identical to \p Op to fill 1711 /// the whole vector. 1712 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow. 1713 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) { 1714 bool OpAPO = getData(OpIdx, Lane).APO; 1715 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) { 1716 if (Ln == Lane) 1717 continue; 1718 // This is set to true if we found a candidate for broadcast at Lane.
1719 bool FoundCandidate = false; 1720 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) { 1721 OperandData &Data = getData(OpI, Ln); 1722 if (Data.APO != OpAPO || Data.IsUsed) 1723 continue; 1724 if (Data.V == Op) { 1725 FoundCandidate = true; 1726 Data.IsUsed = true; 1727 break; 1728 } 1729 } 1730 if (!FoundCandidate) 1731 return false; 1732 } 1733 return true; 1734 } 1735 1736 public: 1737 /// Initialize with all the operands of the instruction vector \p RootVL. 1738 VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL, 1739 ScalarEvolution &SE, const BoUpSLP &R) 1740 : DL(DL), SE(SE), R(R) { 1741 // Append all the operands of RootVL. 1742 appendOperandsOfVL(RootVL); 1743 } 1744 1745 /// \returns a value vector with the operands across all lanes for the 1746 /// operand at \p OpIdx. 1747 ValueList getVL(unsigned OpIdx) const { 1748 ValueList OpVL(OpsVec[OpIdx].size()); 1749 assert(OpsVec[OpIdx].size() == getNumLanes() && 1750 "Expected same num of lanes across all operands"); 1751 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane) 1752 OpVL[Lane] = OpsVec[OpIdx][Lane].V; 1753 return OpVL; 1754 } 1755 1756 // Performs operand reordering for 2 or more operands. 1757 // The original operands are in OrigOps[OpIdx][Lane]. 1758 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'. 1759 void reorder() { 1760 unsigned NumOperands = getNumOperands(); 1761 unsigned NumLanes = getNumLanes(); 1762 // Each operand has its own mode. We are using this mode to help us select 1763 // the instructions for each lane, so that they match best with the ones 1764 // we have selected so far. 1765 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands); 1766 1767 // This is a greedy single-pass algorithm. We are going over each lane 1768 // once and deciding on the best order right away with no back-tracking. 1769 // However, in order to increase its effectiveness, we start with the lane 1770 // that has operands that can move the least. For example, given the 1771 // following lanes: 1772 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd 1773 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st 1774 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd 1775 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th 1776 // we will start at Lane 1, since the operands of the subtraction cannot 1777 // be reordered. Then we will visit the rest of the lanes in a circular 1778 // fashion. That is, Lane 2, then Lane 0, and finally Lane 3. 1779 1780 // Find the first lane that we will start our search from. 1781 unsigned FirstLane = getBestLaneToStartReordering(); 1782 1783 // Initialize the modes. 1784 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { 1785 Value *OpLane0 = getValue(OpIdx, FirstLane); 1786 // Keep track of whether we have instructions with all the same opcode on 1787 // one side. 1788 if (isa<LoadInst>(OpLane0)) 1789 ReorderingModes[OpIdx] = ReorderingMode::Load; 1790 else if (isa<Instruction>(OpLane0)) { 1791 // Check if OpLane0 should be broadcast. 1792 if (shouldBroadcast(OpLane0, OpIdx, FirstLane)) 1793 ReorderingModes[OpIdx] = ReorderingMode::Splat; 1794 else 1795 ReorderingModes[OpIdx] = ReorderingMode::Opcode; 1796 } 1797 else if (isa<Constant>(OpLane0)) 1798 ReorderingModes[OpIdx] = ReorderingMode::Constant; 1799 else if (isa<Argument>(OpLane0)) 1800 // Our best hope is a Splat. It may save some cost in some cases. 1801 ReorderingModes[OpIdx] = ReorderingMode::Splat; 1802 else 1803 // NOTE: This should be unreachable.
1804 ReorderingModes[OpIdx] = ReorderingMode::Failed; 1805 } 1806 1807 // Check that we don't have the same operands. There is no need to reorder 1808 // if the operands are just a perfect or shuffled diamond match. The only 1809 // exceptions (for now) are possible broadcasts or a non-power-of-2 number 1810 // of scalars. 1811 auto &&SkipReordering = [this]() { 1812 SmallPtrSet<Value *, 4> UniqueValues; 1813 ArrayRef<OperandData> Op0 = OpsVec.front(); 1814 for (const OperandData &Data : Op0) 1815 UniqueValues.insert(Data.V); 1816 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) { 1817 if (any_of(Op, [&UniqueValues](const OperandData &Data) { 1818 return !UniqueValues.contains(Data.V); 1819 })) 1820 return false; 1821 } 1822 // TODO: Check if we can remove a check for non-power-2 number of 1823 // scalars after full support of non-power-2 vectorization. 1824 return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size()); 1825 }; 1826 1827 // If the initial strategy fails for any of the operand indexes, then we 1828 // perform reordering again in a second pass. This helps avoid assigning 1829 // high priority to the failed strategy, and should improve reordering for 1830 // the non-failed operand indexes. 1831 for (int Pass = 0; Pass != 2; ++Pass) { 1832 // Check if there is no need to reorder the operands because they are a 1833 // perfect or shuffled diamond match. 1834 // We need to do this to avoid counting extra external use cost for 1835 // shuffled matches, which may cause regressions. 1836 if (SkipReordering()) 1837 break; 1838 // Skip the second pass if the first pass did not fail. 1839 bool StrategyFailed = false; 1840 // Mark all operand data as free to use. 1841 clearUsed(); 1842 // We keep the original operand order for the FirstLane, so reorder the 1843 // rest of the lanes. We are visiting the nodes in a circular fashion, 1844 // using FirstLane as the center point and increasing the radius 1845 // distance. 1846 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands); 1847 for (unsigned I = 0; I < NumOperands; ++I) 1848 MainAltOps[I].push_back(getData(I, FirstLane).V); 1849 1850 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) { 1851 // Visit the lane on the right and then the lane on the left. 1852 for (int Direction : {+1, -1}) { 1853 int Lane = FirstLane + Direction * Distance; 1854 if (Lane < 0 || Lane >= (int)NumLanes) 1855 continue; 1856 int LastLane = Lane - Direction; 1857 assert(LastLane >= 0 && LastLane < (int)NumLanes && 1858 "Out of bounds"); 1859 // Look for a good match for each operand. 1860 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { 1861 // Search for the operand that matches SortedOps[OpIdx][Lane-1]. 1862 Optional<unsigned> BestIdx = getBestOperand( 1863 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]); 1864 // By not selecting a value, we allow the operands that follow to 1865 // select a better matching value. We will get a non-null value in 1866 // the next run of getBestOperand(). 1867 if (BestIdx) { 1868 // Swap the current operand with the one returned by 1869 // getBestOperand(). 1870 swap(OpIdx, BestIdx.getValue(), Lane); 1871 } else { 1872 // We failed to find a best operand, set mode to 'Failed'. 1873 ReorderingModes[OpIdx] = ReorderingMode::Failed; 1874 // Enable the second pass. 1875 StrategyFailed = true; 1876 } 1877 // Try to get the alternate opcode and follow it during analysis.
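// For example (illustrative), if the operand kept for the first lane is an Add and this lane contributes a Sub, the pair forms a valid alternate-opcode shuffle, so the Sub is recorded in MainAltOps and the remaining lanes are scored against both opcodes.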
1878 if (MainAltOps[OpIdx].size() != 2) { 1879 OperandData &AltOp = getData(OpIdx, Lane); 1880 InstructionsState OpS = 1881 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}); 1882 if (OpS.getOpcode() && OpS.isAltShuffle()) 1883 MainAltOps[OpIdx].push_back(AltOp.V); 1884 } 1885 } 1886 } 1887 } 1888 // Skip second pass if the strategy did not fail. 1889 if (!StrategyFailed) 1890 break; 1891 } 1892 } 1893 1894 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1895 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) { 1896 switch (RMode) { 1897 case ReorderingMode::Load: 1898 return "Load"; 1899 case ReorderingMode::Opcode: 1900 return "Opcode"; 1901 case ReorderingMode::Constant: 1902 return "Constant"; 1903 case ReorderingMode::Splat: 1904 return "Splat"; 1905 case ReorderingMode::Failed: 1906 return "Failed"; 1907 } 1908 llvm_unreachable("Unimplemented Reordering Type"); 1909 } 1910 1911 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode, 1912 raw_ostream &OS) { 1913 return OS << getModeStr(RMode); 1914 } 1915 1916 /// Debug print. 1917 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) { 1918 printMode(RMode, dbgs()); 1919 } 1920 1921 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) { 1922 return printMode(RMode, OS); 1923 } 1924 1925 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const { 1926 const unsigned Indent = 2; 1927 unsigned Cnt = 0; 1928 for (const OperandDataVec &OpDataVec : OpsVec) { 1929 OS << "Operand " << Cnt++ << "\n"; 1930 for (const OperandData &OpData : OpDataVec) { 1931 OS.indent(Indent) << "{"; 1932 if (Value *V = OpData.V) 1933 OS << *V; 1934 else 1935 OS << "null"; 1936 OS << ", APO:" << OpData.APO << "}\n"; 1937 } 1938 OS << "\n"; 1939 } 1940 return OS; 1941 } 1942 1943 /// Debug print. 1944 LLVM_DUMP_METHOD void dump() const { print(dbgs()); } 1945 #endif 1946 }; 1947 1948 /// Checks if the instruction is marked for deletion. 1949 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); } 1950 1951 /// Marks the values' operands for later deletion by replacing them with Undefs. 1952 void eraseInstructions(ArrayRef<Value *> AV); 1953 1954 ~BoUpSLP(); 1955 1956 private: 1957 /// Check if the operands on the edges \p Edges of the \p UserTE allow 1958 /// reordering (i.e. the operands can be reordered because they have only one 1959 /// user and are reorderable). 1960 /// \param ReorderableGathers List of all gather nodes that require reordering 1961 /// (e.g., gather of extractelements or partially vectorizable loads). 1962 /// \param GatherOps List of gather operand nodes for \p UserTE that require 1963 /// reordering, subset of \p ReorderableGathers. 1964 bool 1965 canReorderOperands(TreeEntry *UserTE, 1966 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges, 1967 ArrayRef<TreeEntry *> ReorderableGathers, 1968 SmallVectorImpl<TreeEntry *> &GatherOps); 1969 1970 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph, 1971 /// if any. If it is not vectorized (gather node), returns nullptr. 1972 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) { 1973 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx); 1974 TreeEntry *TE = nullptr; 1975 const auto *It = find_if(VL, [this, &TE](Value *V) { 1976 TE = getTreeEntry(V); 1977 return TE; 1978 }); 1979 if (It != VL.end() && TE->isSame(VL)) 1980 return TE; 1981 return nullptr; 1982 } 1983 1984 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph, 1985 /// if any.
If it is not vectorized (gather node), returns nullptr. 1986 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE, 1987 unsigned OpIdx) const { 1988 return const_cast<BoUpSLP *>(this)->getVectorizedOperand( 1989 const_cast<TreeEntry *>(UserTE), OpIdx); 1990 } 1991 1992 /// Checks if all users of \p I are the part of the vectorization tree. 1993 bool areAllUsersVectorized(Instruction *I, 1994 ArrayRef<Value *> VectorizedVals) const; 1995 1996 /// \returns the cost of the vectorizable entry. 1997 InstructionCost getEntryCost(const TreeEntry *E, 1998 ArrayRef<Value *> VectorizedVals); 1999 2000 /// This is the recursive part of buildTree. 2001 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, 2002 const EdgeInfo &EI); 2003 2004 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can 2005 /// be vectorized to use the original vector (or aggregate "bitcast" to a 2006 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise 2007 /// returns false, setting \p CurrentOrder to either an empty vector or a 2008 /// non-identity permutation that allows to reuse extract instructions. 2009 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, 2010 SmallVectorImpl<unsigned> &CurrentOrder) const; 2011 2012 /// Vectorize a single entry in the tree. 2013 Value *vectorizeTree(TreeEntry *E); 2014 2015 /// Vectorize a single entry in the tree, starting in \p VL. 2016 Value *vectorizeTree(ArrayRef<Value *> VL); 2017 2018 /// Create a new vector from a list of scalar values. Produces a sequence 2019 /// which exploits values reused across lanes, and arranges the inserts 2020 /// for ease of later optimization. 2021 Value *createBuildVector(ArrayRef<Value *> VL); 2022 2023 /// \returns the scalarization cost for this type. Scalarization in this 2024 /// context means the creation of vectors from a group of scalars. If \p 2025 /// NeedToShuffle is true, need to add a cost of reshuffling some of the 2026 /// vector elements. 2027 InstructionCost getGatherCost(FixedVectorType *Ty, 2028 const APInt &ShuffledIndices, 2029 bool NeedToShuffle) const; 2030 2031 /// Checks if the gathered \p VL can be represented as shuffle(s) of previous 2032 /// tree entries. 2033 /// \returns ShuffleKind, if gathered values can be represented as shuffles of 2034 /// previous tree entries. \p Mask is filled with the shuffle mask. 2035 Optional<TargetTransformInfo::ShuffleKind> 2036 isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask, 2037 SmallVectorImpl<const TreeEntry *> &Entries); 2038 2039 /// \returns the scalarization cost for this list of values. Assuming that 2040 /// this subtree gets vectorized, we may need to extract the values from the 2041 /// roots. This method calculates the cost of extracting the values. 2042 InstructionCost getGatherCost(ArrayRef<Value *> VL) const; 2043 2044 /// Set the Builder insert point to one after the last instruction in 2045 /// the bundle 2046 void setInsertPointAfterBundle(const TreeEntry *E); 2047 2048 /// \returns a vector from a collection of scalars in \p VL. 2049 Value *gather(ArrayRef<Value *> VL); 2050 2051 /// \returns whether the VectorizableTree is fully vectorizable and will 2052 /// be beneficial even the tree height is tiny. 2053 bool isFullyVectorizableTinyTree(bool ForReduction) const; 2054 2055 /// Reorder commutative or alt operands to get better probability of 2056 /// generating vectorized code. 
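/// For example (illustrative), for the two lanes a0 + b0 and b1 + a1, the commutative operands of the second lane can be swapped so that Left = {a0, a1} and Right = {b0, b1}, making both operand vectors easier to vectorize.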
2057 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, 2058 SmallVectorImpl<Value *> &Left, 2059 SmallVectorImpl<Value *> &Right, 2060 const DataLayout &DL, 2061 ScalarEvolution &SE, 2062 const BoUpSLP &R); 2063 struct TreeEntry { 2064 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>; 2065 TreeEntry(VecTreeTy &Container) : Container(Container) {} 2066 2067 /// \returns true if the scalars in VL are equal to this entry. 2068 bool isSame(ArrayRef<Value *> VL) const { 2069 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) { 2070 if (Mask.size() != VL.size() && VL.size() == Scalars.size()) 2071 return std::equal(VL.begin(), VL.end(), Scalars.begin()); 2072 return VL.size() == Mask.size() && 2073 std::equal(VL.begin(), VL.end(), Mask.begin(), 2074 [Scalars](Value *V, int Idx) { 2075 return (isa<UndefValue>(V) && 2076 Idx == UndefMaskElem) || 2077 (Idx != UndefMaskElem && V == Scalars[Idx]); 2078 }); 2079 }; 2080 if (!ReorderIndices.empty()) { 2081 // TODO: implement matching if the nodes are just reordered, still can 2082 // treat the vector as the same if the list of scalars matches VL 2083 // directly, without reordering. 2084 SmallVector<int> Mask; 2085 inversePermutation(ReorderIndices, Mask); 2086 if (VL.size() == Scalars.size()) 2087 return IsSame(Scalars, Mask); 2088 if (VL.size() == ReuseShuffleIndices.size()) { 2089 ::addMask(Mask, ReuseShuffleIndices); 2090 return IsSame(Scalars, Mask); 2091 } 2092 return false; 2093 } 2094 return IsSame(Scalars, ReuseShuffleIndices); 2095 } 2096 2097 /// \returns true if current entry has same operands as \p TE. 2098 bool hasEqualOperands(const TreeEntry &TE) const { 2099 if (TE.getNumOperands() != getNumOperands()) 2100 return false; 2101 SmallBitVector Used(getNumOperands()); 2102 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) { 2103 unsigned PrevCount = Used.count(); 2104 for (unsigned K = 0; K < E; ++K) { 2105 if (Used.test(K)) 2106 continue; 2107 if (getOperand(K) == TE.getOperand(I)) { 2108 Used.set(K); 2109 break; 2110 } 2111 } 2112 // Check if we actually found the matching operand. 2113 if (PrevCount == Used.count()) 2114 return false; 2115 } 2116 return true; 2117 } 2118 2119 /// \return Final vectorization factor for the node. Defined by the total 2120 /// number of vectorized scalars, including those, used several times in the 2121 /// entry and counted in the \a ReuseShuffleIndices, if any. 2122 unsigned getVectorFactor() const { 2123 if (!ReuseShuffleIndices.empty()) 2124 return ReuseShuffleIndices.size(); 2125 return Scalars.size(); 2126 }; 2127 2128 /// A vector of scalars. 2129 ValueList Scalars; 2130 2131 /// The Scalars are vectorized into this value. It is initialized to Null. 2132 Value *VectorizedValue = nullptr; 2133 2134 /// Do we need to gather this sequence or vectorize it 2135 /// (either with vector instruction or with scatter/gather 2136 /// intrinsics for store/load)? 2137 enum EntryState { Vectorize, ScatterVectorize, NeedToGather }; 2138 EntryState State; 2139 2140 /// Does this sequence require some shuffling? 2141 SmallVector<int, 4> ReuseShuffleIndices; 2142 2143 /// Does this entry require reordering? 2144 SmallVector<unsigned, 4> ReorderIndices; 2145 2146 /// Points back to the VectorizableTree. 2147 /// 2148 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has 2149 /// to be a pointer and needs to be able to initialize the child iterator. 2150 /// Thus we need a reference back to the container to translate the indices 2151 /// to entries. 
2152 VecTreeTy &Container; 2153 2154 /// The TreeEntry index containing the user of this entry. We can actually 2155 /// have multiple users so the data structure is not truly a tree. 2156 SmallVector<EdgeInfo, 1> UserTreeIndices; 2157 2158 /// The index of this treeEntry in VectorizableTree. 2159 int Idx = -1; 2160 2161 private: 2162 /// The operands of each instruction in each lane Operands[op_index][lane]. 2163 /// Note: This helps avoid the replication of the code that performs the 2164 /// reordering of operands during buildTree_rec() and vectorizeTree(). 2165 SmallVector<ValueList, 2> Operands; 2166 2167 /// The main/alternate instruction. 2168 Instruction *MainOp = nullptr; 2169 Instruction *AltOp = nullptr; 2170 2171 public: 2172 /// Set this bundle's \p OpIdx'th operand to \p OpVL. 2173 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) { 2174 if (Operands.size() < OpIdx + 1) 2175 Operands.resize(OpIdx + 1); 2176 assert(Operands[OpIdx].empty() && "Already resized?"); 2177 assert(OpVL.size() <= Scalars.size() && 2178 "Number of operands is greater than the number of scalars."); 2179 Operands[OpIdx].resize(OpVL.size()); 2180 copy(OpVL, Operands[OpIdx].begin()); 2181 } 2182 2183 /// Set the operands of this bundle in their original order. 2184 void setOperandsInOrder() { 2185 assert(Operands.empty() && "Already initialized?"); 2186 auto *I0 = cast<Instruction>(Scalars[0]); 2187 Operands.resize(I0->getNumOperands()); 2188 unsigned NumLanes = Scalars.size(); 2189 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands(); 2190 OpIdx != NumOperands; ++OpIdx) { 2191 Operands[OpIdx].resize(NumLanes); 2192 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 2193 auto *I = cast<Instruction>(Scalars[Lane]); 2194 assert(I->getNumOperands() == NumOperands && 2195 "Expected same number of operands"); 2196 Operands[OpIdx][Lane] = I->getOperand(OpIdx); 2197 } 2198 } 2199 } 2200 2201 /// Reorders operands of the node to the given mask \p Mask. 2202 void reorderOperands(ArrayRef<int> Mask) { 2203 for (ValueList &Operand : Operands) 2204 reorderScalars(Operand, Mask); 2205 } 2206 2207 /// \returns the \p OpIdx operand of this TreeEntry. 2208 ValueList &getOperand(unsigned OpIdx) { 2209 assert(OpIdx < Operands.size() && "Off bounds"); 2210 return Operands[OpIdx]; 2211 } 2212 2213 /// \returns the \p OpIdx operand of this TreeEntry. 2214 ArrayRef<Value *> getOperand(unsigned OpIdx) const { 2215 assert(OpIdx < Operands.size() && "Off bounds"); 2216 return Operands[OpIdx]; 2217 } 2218 2219 /// \returns the number of operands. 2220 unsigned getNumOperands() const { return Operands.size(); } 2221 2222 /// \return the single \p OpIdx operand. 2223 Value *getSingleOperand(unsigned OpIdx) const { 2224 assert(OpIdx < Operands.size() && "Off bounds"); 2225 assert(!Operands[OpIdx].empty() && "No operand available"); 2226 return Operands[OpIdx][0]; 2227 } 2228 2229 /// Some of the instructions in the list have alternate opcodes. 2230 bool isAltShuffle() const { return MainOp != AltOp; } 2231 2232 bool isOpcodeOrAlt(Instruction *I) const { 2233 unsigned CheckedOpcode = I->getOpcode(); 2234 return (getOpcode() == CheckedOpcode || 2235 getAltOpcode() == CheckedOpcode); 2236 } 2237 2238 /// Chooses the correct key for scheduling data. If \p Op has the same (or 2239 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is 2240 /// \p OpValue. 
2241 Value *isOneOf(Value *Op) const { 2242 auto *I = dyn_cast<Instruction>(Op); 2243 if (I && isOpcodeOrAlt(I)) 2244 return Op; 2245 return MainOp; 2246 } 2247 2248 void setOperations(const InstructionsState &S) { 2249 MainOp = S.MainOp; 2250 AltOp = S.AltOp; 2251 } 2252 2253 Instruction *getMainOp() const { 2254 return MainOp; 2255 } 2256 2257 Instruction *getAltOp() const { 2258 return AltOp; 2259 } 2260 2261 /// The main/alternate opcodes for the list of instructions. 2262 unsigned getOpcode() const { 2263 return MainOp ? MainOp->getOpcode() : 0; 2264 } 2265 2266 unsigned getAltOpcode() const { 2267 return AltOp ? AltOp->getOpcode() : 0; 2268 } 2269 2270 /// When ReuseReorderShuffleIndices is empty it just returns position of \p 2271 /// V within vector of Scalars. Otherwise, try to remap on its reuse index. 2272 int findLaneForValue(Value *V) const { 2273 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V)); 2274 assert(FoundLane < Scalars.size() && "Couldn't find extract lane"); 2275 if (!ReorderIndices.empty()) 2276 FoundLane = ReorderIndices[FoundLane]; 2277 assert(FoundLane < Scalars.size() && "Couldn't find extract lane"); 2278 if (!ReuseShuffleIndices.empty()) { 2279 FoundLane = std::distance(ReuseShuffleIndices.begin(), 2280 find(ReuseShuffleIndices, FoundLane)); 2281 } 2282 return FoundLane; 2283 } 2284 2285 #ifndef NDEBUG 2286 /// Debug printer. 2287 LLVM_DUMP_METHOD void dump() const { 2288 dbgs() << Idx << ".\n"; 2289 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) { 2290 dbgs() << "Operand " << OpI << ":\n"; 2291 for (const Value *V : Operands[OpI]) 2292 dbgs().indent(2) << *V << "\n"; 2293 } 2294 dbgs() << "Scalars: \n"; 2295 for (Value *V : Scalars) 2296 dbgs().indent(2) << *V << "\n"; 2297 dbgs() << "State: "; 2298 switch (State) { 2299 case Vectorize: 2300 dbgs() << "Vectorize\n"; 2301 break; 2302 case ScatterVectorize: 2303 dbgs() << "ScatterVectorize\n"; 2304 break; 2305 case NeedToGather: 2306 dbgs() << "NeedToGather\n"; 2307 break; 2308 } 2309 dbgs() << "MainOp: "; 2310 if (MainOp) 2311 dbgs() << *MainOp << "\n"; 2312 else 2313 dbgs() << "NULL\n"; 2314 dbgs() << "AltOp: "; 2315 if (AltOp) 2316 dbgs() << *AltOp << "\n"; 2317 else 2318 dbgs() << "NULL\n"; 2319 dbgs() << "VectorizedValue: "; 2320 if (VectorizedValue) 2321 dbgs() << *VectorizedValue << "\n"; 2322 else 2323 dbgs() << "NULL\n"; 2324 dbgs() << "ReuseShuffleIndices: "; 2325 if (ReuseShuffleIndices.empty()) 2326 dbgs() << "Empty"; 2327 else 2328 for (int ReuseIdx : ReuseShuffleIndices) 2329 dbgs() << ReuseIdx << ", "; 2330 dbgs() << "\n"; 2331 dbgs() << "ReorderIndices: "; 2332 for (unsigned ReorderIdx : ReorderIndices) 2333 dbgs() << ReorderIdx << ", "; 2334 dbgs() << "\n"; 2335 dbgs() << "UserTreeIndices: "; 2336 for (const auto &EInfo : UserTreeIndices) 2337 dbgs() << EInfo << ", "; 2338 dbgs() << "\n"; 2339 } 2340 #endif 2341 }; 2342 2343 #ifndef NDEBUG 2344 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost, 2345 InstructionCost VecCost, 2346 InstructionCost ScalarCost) const { 2347 dbgs() << "SLP: Calculated costs for Tree:\n"; E->dump(); 2348 dbgs() << "SLP: Costs:\n"; 2349 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n"; 2350 dbgs() << "SLP: VectorCost = " << VecCost << "\n"; 2351 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n"; 2352 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = " << 2353 ReuseShuffleCost + VecCost - ScalarCost << "\n"; 2354 } 2355 #endif 2356 2357 /// Create a new VectorizableTree entry. 
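/// This overload derives the entry state from \p Bundle: a present scheduling bundle yields a vectorized node, otherwise the scalars need to be gathered.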
2358 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle, 2359 const InstructionsState &S, 2360 const EdgeInfo &UserTreeIdx, 2361 ArrayRef<int> ReuseShuffleIndices = None, 2362 ArrayRef<unsigned> ReorderIndices = None) { 2363 TreeEntry::EntryState EntryState = 2364 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather; 2365 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx, 2366 ReuseShuffleIndices, ReorderIndices); 2367 } 2368 2369 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, 2370 TreeEntry::EntryState EntryState, 2371 Optional<ScheduleData *> Bundle, 2372 const InstructionsState &S, 2373 const EdgeInfo &UserTreeIdx, 2374 ArrayRef<int> ReuseShuffleIndices = None, 2375 ArrayRef<unsigned> ReorderIndices = None) { 2376 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || 2377 (Bundle && EntryState != TreeEntry::NeedToGather)) && 2378 "Need to vectorize gather entry?"); 2379 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree)); 2380 TreeEntry *Last = VectorizableTree.back().get(); 2381 Last->Idx = VectorizableTree.size() - 1; 2382 Last->State = EntryState; 2383 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), 2384 ReuseShuffleIndices.end()); 2385 if (ReorderIndices.empty()) { 2386 Last->Scalars.assign(VL.begin(), VL.end()); 2387 Last->setOperations(S); 2388 } else { 2389 // Reorder scalars and build final mask. 2390 Last->Scalars.assign(VL.size(), nullptr); 2391 transform(ReorderIndices, Last->Scalars.begin(), 2392 [VL](unsigned Idx) -> Value * { 2393 if (Idx >= VL.size()) 2394 return UndefValue::get(VL.front()->getType()); 2395 return VL[Idx]; 2396 }); 2397 InstructionsState S = getSameOpcode(Last->Scalars); 2398 Last->setOperations(S); 2399 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); 2400 } 2401 if (Last->State != TreeEntry::NeedToGather) { 2402 for (Value *V : VL) { 2403 assert(!getTreeEntry(V) && "Scalar already in tree!"); 2404 ScalarToTreeEntry[V] = Last; 2405 } 2406 // Update the scheduler bundle to point to this TreeEntry. 2407 ScheduleData *BundleMember = Bundle.getValue(); 2408 assert((BundleMember || isa<PHINode>(S.MainOp) || 2409 isVectorLikeInstWithConstOps(S.MainOp) || 2410 doesNotNeedToSchedule(VL)) && 2411 "Bundle and VL out of sync"); 2412 if (BundleMember) { 2413 for (Value *V : VL) { 2414 if (doesNotNeedToBeScheduled(V)) 2415 continue; 2416 assert(BundleMember && "Unexpected end of bundle."); 2417 BundleMember->TE = Last; 2418 BundleMember = BundleMember->NextInBundle; 2419 } 2420 } 2421 assert(!BundleMember && "Bundle and VL out of sync"); 2422 } else { 2423 MustGather.insert(VL.begin(), VL.end()); 2424 } 2425 2426 if (UserTreeIdx.UserTE) 2427 Last->UserTreeIndices.push_back(UserTreeIdx); 2428 2429 return Last; 2430 } 2431 2432 /// -- Vectorization State -- 2433 /// Holds all of the tree entries. 2434 TreeEntry::VecTreeTy VectorizableTree; 2435 2436 #ifndef NDEBUG 2437 /// Debug printer. 2438 LLVM_DUMP_METHOD void dumpVectorizableTree() const { 2439 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) { 2440 VectorizableTree[Id]->dump(); 2441 dbgs() << "\n"; 2442 } 2443 } 2444 #endif 2445 2446 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); } 2447 2448 const TreeEntry *getTreeEntry(Value *V) const { 2449 return ScalarToTreeEntry.lookup(V); 2450 } 2451 2452 /// Maps a specific scalar to its tree entry. 2453 SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry; 2454 2455 /// Maps a value to the proposed vectorizable size. 
2456 SmallDenseMap<Value *, unsigned> InstrElementSize; 2457 2458 /// A list of scalars that we found that we need to keep as scalars. 2459 ValueSet MustGather; 2460 2461 /// This POD struct describes one external user in the vectorized tree. 2462 struct ExternalUser { 2463 ExternalUser(Value *S, llvm::User *U, int L) 2464 : Scalar(S), User(U), Lane(L) {} 2465 2466 // Which scalar in our function. 2467 Value *Scalar; 2468 2469 // Which user that uses the scalar. 2470 llvm::User *User; 2471 2472 // Which lane does the scalar belong to. 2473 int Lane; 2474 }; 2475 using UserList = SmallVector<ExternalUser, 16>; 2476 2477 /// Checks if two instructions may access the same memory. 2478 /// 2479 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it 2480 /// is invariant in the calling loop. 2481 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1, 2482 Instruction *Inst2) { 2483 // First check if the result is already in the cache. 2484 AliasCacheKey key = std::make_pair(Inst1, Inst2); 2485 Optional<bool> &result = AliasCache[key]; 2486 if (result.hasValue()) { 2487 return result.getValue(); 2488 } 2489 bool aliased = true; 2490 if (Loc1.Ptr && isSimple(Inst1)) 2491 aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1)); 2492 // Store the result in the cache. 2493 result = aliased; 2494 return aliased; 2495 } 2496 2497 using AliasCacheKey = std::pair<Instruction *, Instruction *>; 2498 2499 /// Cache for alias results. 2500 /// TODO: consider moving this to the AliasAnalysis itself. 2501 DenseMap<AliasCacheKey, Optional<bool>> AliasCache; 2502 2503 // Cache for pointerMayBeCaptured calls inside AA. This is preserved 2504 // globally through SLP because we don't perform any action which 2505 // invalidates capture results. 2506 BatchAAResults BatchAA; 2507 2508 /// Removes an instruction from its block and eventually deletes it. 2509 /// It's like Instruction::eraseFromParent() except that the actual deletion 2510 /// is delayed until BoUpSLP is destructed. 2511 /// This is required to ensure that there are no incorrect collisions in the 2512 /// AliasCache, which can happen if a new instruction is allocated at the 2513 /// same address as a previously deleted instruction. 2514 void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) { 2515 auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first; 2516 It->getSecond() = It->getSecond() && ReplaceOpsWithUndef; 2517 } 2518 2519 /// Temporary store for deleted instructions. Instructions will be deleted 2520 /// eventually when the BoUpSLP is destructed. 2521 DenseMap<Instruction *, bool> DeletedInstructions; 2522 2523 /// A list of values that need to extracted out of the tree. 2524 /// This list holds pairs of (Internal Scalar : External User). External User 2525 /// can be nullptr, it means that this Internal Scalar will be used later, 2526 /// after vectorization. 2527 UserList ExternalUses; 2528 2529 /// Values used only by @llvm.assume calls. 2530 SmallPtrSet<const Value *, 32> EphValues; 2531 2532 /// Holds all of the instructions that we gathered. 2533 SetVector<Instruction *> GatherShuffleSeq; 2534 2535 /// A list of blocks that we are going to CSE. 2536 SetVector<BasicBlock *> CSEBlocks; 2537 2538 /// Contains all scheduling relevant data for an instruction. 2539 /// A ScheduleData either represents a single instruction or a member of an 2540 /// instruction bundle (= a group of instructions which is combined into a 2541 /// vector instruction). 
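/// For example (illustrative), when two loads L0 and L1 are bundled into one vector load, each gets its own ScheduleData; FirstInBundle points to L0's node for both of them and L0's NextInBundle points to L1's node.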
2542 struct ScheduleData { 2543 // The initial value for the dependency counters. It means that the 2544 // dependencies are not calculated yet. 2545 enum { InvalidDeps = -1 }; 2546 2547 ScheduleData() = default; 2548 2549 void init(int BlockSchedulingRegionID, Value *OpVal) { 2550 FirstInBundle = this; 2551 NextInBundle = nullptr; 2552 NextLoadStore = nullptr; 2553 IsScheduled = false; 2554 SchedulingRegionID = BlockSchedulingRegionID; 2555 clearDependencies(); 2556 OpValue = OpVal; 2557 TE = nullptr; 2558 } 2559 2560 /// Verify basic self-consistency properties. 2561 void verify() { 2562 if (hasValidDependencies()) { 2563 assert(UnscheduledDeps <= Dependencies && "invariant"); 2564 } else { 2565 assert(UnscheduledDeps == Dependencies && "invariant"); 2566 } 2567 2568 if (IsScheduled) { 2569 assert(isSchedulingEntity() && 2570 "unexpected scheduled state"); 2571 for (const ScheduleData *BundleMember = this; BundleMember; 2572 BundleMember = BundleMember->NextInBundle) { 2573 assert(BundleMember->hasValidDependencies() && 2574 BundleMember->UnscheduledDeps == 0 && 2575 "unexpected scheduled state"); 2576 assert((BundleMember == this || !BundleMember->IsScheduled) && 2577 "only bundle is marked scheduled"); 2578 } 2579 } 2580 2581 assert(Inst->getParent() == FirstInBundle->Inst->getParent() && 2582 "all bundle members must be in same basic block"); 2583 } 2584 2585 /// Returns true if the dependency information has been calculated. 2586 /// Note that dependency validity can vary between instructions within 2587 /// a single bundle. 2588 bool hasValidDependencies() const { return Dependencies != InvalidDeps; } 2589 2590 /// Returns true for single instructions and for bundle representatives 2591 /// (= the head of a bundle). 2592 bool isSchedulingEntity() const { return FirstInBundle == this; } 2593 2594 /// Returns true if it represents an instruction bundle and not only a 2595 /// single instruction. 2596 bool isPartOfBundle() const { 2597 return NextInBundle != nullptr || FirstInBundle != this || TE; 2598 } 2599 2600 /// Returns true if it is ready for scheduling, i.e. it has no more 2601 /// unscheduled dependent instructions/bundles. 2602 bool isReady() const { 2603 assert(isSchedulingEntity() && 2604 "can't consider non-scheduling entity for ready list"); 2605 return unscheduledDepsInBundle() == 0 && !IsScheduled; 2606 } 2607 2608 /// Modifies the number of unscheduled dependencies for this instruction, 2609 /// and returns the number of remaining dependencies for the containing 2610 /// bundle. 2611 int incrementUnscheduledDeps(int Incr) { 2612 assert(hasValidDependencies() && 2613 "increment of unscheduled deps would be meaningless"); 2614 UnscheduledDeps += Incr; 2615 return FirstInBundle->unscheduledDepsInBundle(); 2616 } 2617 2618 /// Sets the number of unscheduled dependencies to the number of 2619 /// dependencies. 2620 void resetUnscheduledDeps() { 2621 UnscheduledDeps = Dependencies; 2622 } 2623 2624 /// Clears all dependency information.
2625 void clearDependencies() { 2626 Dependencies = InvalidDeps; 2627 resetUnscheduledDeps(); 2628 MemoryDependencies.clear(); 2629 ControlDependencies.clear(); 2630 } 2631 2632 int unscheduledDepsInBundle() const { 2633 assert(isSchedulingEntity() && "only meaningful on the bundle"); 2634 int Sum = 0; 2635 for (const ScheduleData *BundleMember = this; BundleMember; 2636 BundleMember = BundleMember->NextInBundle) { 2637 if (BundleMember->UnscheduledDeps == InvalidDeps) 2638 return InvalidDeps; 2639 Sum += BundleMember->UnscheduledDeps; 2640 } 2641 return Sum; 2642 } 2643 2644 void dump(raw_ostream &os) const { 2645 if (!isSchedulingEntity()) { 2646 os << "/ " << *Inst; 2647 } else if (NextInBundle) { 2648 os << '[' << *Inst; 2649 ScheduleData *SD = NextInBundle; 2650 while (SD) { 2651 os << ';' << *SD->Inst; 2652 SD = SD->NextInBundle; 2653 } 2654 os << ']'; 2655 } else { 2656 os << *Inst; 2657 } 2658 } 2659 2660 Instruction *Inst = nullptr; 2661 2662 /// Opcode of the current instruction in the schedule data. 2663 Value *OpValue = nullptr; 2664 2665 /// The TreeEntry that this instruction corresponds to. 2666 TreeEntry *TE = nullptr; 2667 2668 /// Points to the head in an instruction bundle (and always to this for 2669 /// single instructions). 2670 ScheduleData *FirstInBundle = nullptr; 2671 2672 /// Single linked list of all instructions in a bundle. Null if it is a 2673 /// single instruction. 2674 ScheduleData *NextInBundle = nullptr; 2675 2676 /// Single linked list of all memory instructions (e.g. load, store, call) 2677 /// in the block - until the end of the scheduling region. 2678 ScheduleData *NextLoadStore = nullptr; 2679 2680 /// The dependent memory instructions. 2681 /// This list is derived on demand in calculateDependencies(). 2682 SmallVector<ScheduleData *, 4> MemoryDependencies; 2683 2684 /// List of instructions which this instruction could be control dependent 2685 /// on. Allowing such nodes to be scheduled below this one could introduce 2686 /// a runtime fault which didn't exist in the original program. 2687 /// ex: this is a load or udiv following a readonly call which inf loops 2688 SmallVector<ScheduleData *, 4> ControlDependencies; 2689 2690 /// This ScheduleData is in the current scheduling region if this matches 2691 /// the current SchedulingRegionID of BlockScheduling. 2692 int SchedulingRegionID = 0; 2693 2694 /// Used for getting a "good" final ordering of instructions. 2695 int SchedulingPriority = 0; 2696 2697 /// The number of dependencies. Constitutes of the number of users of the 2698 /// instruction plus the number of dependent memory instructions (if any). 2699 /// This value is calculated on demand. 2700 /// If InvalidDeps, the number of dependencies is not calculated yet. 2701 int Dependencies = InvalidDeps; 2702 2703 /// The number of dependencies minus the number of dependencies of scheduled 2704 /// instructions. As soon as this is zero, the instruction/bundle gets ready 2705 /// for scheduling. 2706 /// Note that this is negative as long as Dependencies is not calculated. 2707 int UnscheduledDeps = InvalidDeps; 2708 2709 /// True if this instruction is scheduled (or considered as scheduled in the 2710 /// dry-run). 
2711 bool IsScheduled = false; 2712 }; 2713 2714 #ifndef NDEBUG 2715 friend inline raw_ostream &operator<<(raw_ostream &os, 2716 const BoUpSLP::ScheduleData &SD) { 2717 SD.dump(os); 2718 return os; 2719 } 2720 #endif 2721 2722 friend struct GraphTraits<BoUpSLP *>; 2723 friend struct DOTGraphTraits<BoUpSLP *>; 2724 2725 /// Contains all scheduling data for a basic block. 2726 /// It does not schedule instructions that are not memory read/write 2727 /// instructions and whose operands are either constants, arguments, 2728 /// phis, or instructions from other blocks, or whose users are phis or in 2729 /// other blocks. The resulting vector instructions can be placed at the 2730 /// beginning of the basic block without scheduling (if the operands do not 2731 /// need to be scheduled) or at the end of the block (if the users are outside 2732 /// of the block). This saves some compile time and memory used by the 2733 /// compiler. 2734 /// ScheduleData is assigned to each instruction between the boundaries of 2735 /// the tree entry, even to those that are not part of the graph. It is 2736 /// required to correctly follow the dependencies between the instructions and 2737 /// to schedule them correctly. ScheduleData is not allocated for 2738 /// instructions that do not require scheduling, like phis, nodes with 2739 /// only extractelements/insertelements, or nodes whose instructions have 2740 /// uses/operands outside of the block. 2741 struct BlockScheduling { 2742 BlockScheduling(BasicBlock *BB) 2743 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {} 2744 2745 void clear() { 2746 ReadyInsts.clear(); 2747 ScheduleStart = nullptr; 2748 ScheduleEnd = nullptr; 2749 FirstLoadStoreInRegion = nullptr; 2750 LastLoadStoreInRegion = nullptr; 2751 2752 // Reduce the maximum schedule region size by the size of the 2753 // previous scheduling run. 2754 ScheduleRegionSizeLimit -= ScheduleRegionSize; 2755 if (ScheduleRegionSizeLimit < MinScheduleRegionSize) 2756 ScheduleRegionSizeLimit = MinScheduleRegionSize; 2757 ScheduleRegionSize = 0; 2758 2759 // Make a new scheduling region, i.e. all existing ScheduleData is not 2760 // in the new region yet. 2761 ++SchedulingRegionID; 2762 } 2763 2764 ScheduleData *getScheduleData(Instruction *I) { 2765 if (BB != I->getParent()) 2766 // Avoid lookup if can't possibly be in map. 2767 return nullptr; 2768 ScheduleData *SD = ScheduleDataMap.lookup(I); 2769 if (SD && isInSchedulingRegion(SD)) 2770 return SD; 2771 return nullptr; 2772 } 2773 2774 ScheduleData *getScheduleData(Value *V) { 2775 if (auto *I = dyn_cast<Instruction>(V)) 2776 return getScheduleData(I); 2777 return nullptr; 2778 } 2779 2780 ScheduleData *getScheduleData(Value *V, Value *Key) { 2781 if (V == Key) 2782 return getScheduleData(V); 2783 auto I = ExtraScheduleDataMap.find(V); 2784 if (I != ExtraScheduleDataMap.end()) { 2785 ScheduleData *SD = I->second.lookup(Key); 2786 if (SD && isInSchedulingRegion(SD)) 2787 return SD; 2788 } 2789 return nullptr; 2790 } 2791 2792 bool isInSchedulingRegion(ScheduleData *SD) const { 2793 return SD->SchedulingRegionID == SchedulingRegionID; 2794 } 2795 2796 /// Marks an instruction as scheduled and puts all dependent ready 2797 /// instructions into the ready-list.
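/// For example (illustrative), if a dependent bundle has exactly one unscheduled dependency left and that dependency is a member of the bundle scheduled here, its counter drops to zero and it is inserted into \p ReadyList.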
2798 template <typename ReadyListType> 2799 void schedule(ScheduleData *SD, ReadyListType &ReadyList) { 2800 SD->IsScheduled = true; 2801 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); 2802 2803 for (ScheduleData *BundleMember = SD; BundleMember; 2804 BundleMember = BundleMember->NextInBundle) { 2805 if (BundleMember->Inst != BundleMember->OpValue) 2806 continue; 2807 2808 // Handle the def-use chain dependencies. 2809 2810 // Decrement the unscheduled counter and insert to ready list if ready. 2811 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) { 2812 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) { 2813 if (OpDef && OpDef->hasValidDependencies() && 2814 OpDef->incrementUnscheduledDeps(-1) == 0) { 2815 // There are no more unscheduled dependencies after 2816 // decrementing, so we can put the dependent instruction 2817 // into the ready list. 2818 ScheduleData *DepBundle = OpDef->FirstInBundle; 2819 assert(!DepBundle->IsScheduled && 2820 "already scheduled bundle gets ready"); 2821 ReadyList.insert(DepBundle); 2822 LLVM_DEBUG(dbgs() 2823 << "SLP: gets ready (def): " << *DepBundle << "\n"); 2824 } 2825 }); 2826 }; 2827 2828 // If BundleMember is a vector bundle, its operands may have been 2829 // reordered during buildTree(). We therefore need to get its operands 2830 // through the TreeEntry. 2831 if (TreeEntry *TE = BundleMember->TE) { 2832 // Need to search for the lane since the tree entry can be reordered. 2833 int Lane = std::distance(TE->Scalars.begin(), 2834 find(TE->Scalars, BundleMember->Inst)); 2835 assert(Lane >= 0 && "Lane not set"); 2836 2837 // Since vectorization tree is being built recursively this assertion 2838 // ensures that the tree entry has all operands set before reaching 2839 // this code. Couple of exceptions known at the moment are extracts 2840 // where their second (immediate) operand is not added. Since 2841 // immediates do not affect scheduler behavior this is considered 2842 // okay. 2843 auto *In = BundleMember->Inst; 2844 assert(In && 2845 (isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) || 2846 In->getNumOperands() == TE->getNumOperands()) && 2847 "Missed TreeEntry operands?"); 2848 (void)In; // fake use to avoid build failure when assertions disabled 2849 2850 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands(); 2851 OpIdx != NumOperands; ++OpIdx) 2852 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane])) 2853 DecrUnsched(I); 2854 } else { 2855 // If BundleMember is a stand-alone instruction, no operand reordering 2856 // has taken place, so we directly access its operands. 2857 for (Use &U : BundleMember->Inst->operands()) 2858 if (auto *I = dyn_cast<Instruction>(U.get())) 2859 DecrUnsched(I); 2860 } 2861 // Handle the memory dependencies. 2862 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { 2863 if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { 2864 // There are no more unscheduled dependencies after decrementing, 2865 // so we can put the dependent instruction into the ready list. 2866 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle; 2867 assert(!DepBundle->IsScheduled && 2868 "already scheduled bundle gets ready"); 2869 ReadyList.insert(DepBundle); 2870 LLVM_DEBUG(dbgs() 2871 << "SLP: gets ready (mem): " << *DepBundle << "\n"); 2872 } 2873 } 2874 // Handle the control dependencies. 
2875 for (ScheduleData *DepSD : BundleMember->ControlDependencies) { 2876 if (DepSD->incrementUnscheduledDeps(-1) == 0) { 2877 // There are no more unscheduled dependencies after decrementing, 2878 // so we can put the dependent instruction into the ready list. 2879 ScheduleData *DepBundle = DepSD->FirstInBundle; 2880 assert(!DepBundle->IsScheduled && 2881 "already scheduled bundle gets ready"); 2882 ReadyList.insert(DepBundle); 2883 LLVM_DEBUG(dbgs() 2884 << "SLP: gets ready (ctl): " << *DepBundle << "\n"); 2885 } 2886 } 2887 2888 } 2889 } 2890 2891 /// Verify basic self consistency properties of the data structure. 2892 void verify() { 2893 if (!ScheduleStart) 2894 return; 2895 2896 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() && 2897 ScheduleStart->comesBefore(ScheduleEnd) && 2898 "Not a valid scheduling region?"); 2899 2900 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 2901 auto *SD = getScheduleData(I); 2902 if (!SD) 2903 continue; 2904 assert(isInSchedulingRegion(SD) && 2905 "primary schedule data not in window?"); 2906 assert(isInSchedulingRegion(SD->FirstInBundle) && 2907 "entire bundle in window!"); 2908 (void)SD; 2909 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); }); 2910 } 2911 2912 for (auto *SD : ReadyInsts) { 2913 assert(SD->isSchedulingEntity() && SD->isReady() && 2914 "item in ready list not ready?"); 2915 (void)SD; 2916 } 2917 } 2918 2919 void doForAllOpcodes(Value *V, 2920 function_ref<void(ScheduleData *SD)> Action) { 2921 if (ScheduleData *SD = getScheduleData(V)) 2922 Action(SD); 2923 auto I = ExtraScheduleDataMap.find(V); 2924 if (I != ExtraScheduleDataMap.end()) 2925 for (auto &P : I->second) 2926 if (isInSchedulingRegion(P.second)) 2927 Action(P.second); 2928 } 2929 2930 /// Put all instructions into the ReadyList which are ready for scheduling. 2931 template <typename ReadyListType> 2932 void initialFillReadyList(ReadyListType &ReadyList) { 2933 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 2934 doForAllOpcodes(I, [&](ScheduleData *SD) { 2935 if (SD->isSchedulingEntity() && SD->isReady()) { 2936 ReadyList.insert(SD); 2937 LLVM_DEBUG(dbgs() 2938 << "SLP: initially in ready list: " << *SD << "\n"); 2939 } 2940 }); 2941 } 2942 } 2943 2944 /// Build a bundle from the ScheduleData nodes corresponding to the 2945 /// scalar instruction for each lane. 2946 ScheduleData *buildBundle(ArrayRef<Value *> VL); 2947 2948 /// Checks if a bundle of instructions can be scheduled, i.e. has no 2949 /// cyclic dependencies. This is only a dry-run, no instructions are 2950 /// actually moved at this stage. 2951 /// \returns the scheduling bundle. The returned Optional value is non-None 2952 /// if \p VL is allowed to be scheduled. 2953 Optional<ScheduleData *> 2954 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, 2955 const InstructionsState &S); 2956 2957 /// Un-bundles a group of instructions. 2958 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue); 2959 2960 /// Allocates schedule data chunk. 2961 ScheduleData *allocateScheduleDataChunks(); 2962 2963 /// Extends the scheduling region so that V is inside the region. 2964 /// \returns true if the region size is within the limit. 2965 bool extendSchedulingRegion(Value *V, const InstructionsState &S); 2966 2967 /// Initialize the ScheduleData structures for new instructions in the 2968 /// scheduling region. 
2969 void initScheduleData(Instruction *FromI, Instruction *ToI, 2970 ScheduleData *PrevLoadStore, 2971 ScheduleData *NextLoadStore); 2972 2973 /// Updates the dependency information of a bundle and of all instructions/ 2974 /// bundles which depend on the original bundle. 2975 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList, 2976 BoUpSLP *SLP); 2977 2978 /// Sets all instruction in the scheduling region to un-scheduled. 2979 void resetSchedule(); 2980 2981 BasicBlock *BB; 2982 2983 /// Simple memory allocation for ScheduleData. 2984 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks; 2985 2986 /// The size of a ScheduleData array in ScheduleDataChunks. 2987 int ChunkSize; 2988 2989 /// The allocator position in the current chunk, which is the last entry 2990 /// of ScheduleDataChunks. 2991 int ChunkPos; 2992 2993 /// Attaches ScheduleData to Instruction. 2994 /// Note that the mapping survives during all vectorization iterations, i.e. 2995 /// ScheduleData structures are recycled. 2996 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap; 2997 2998 /// Attaches ScheduleData to Instruction with the leading key. 2999 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>> 3000 ExtraScheduleDataMap; 3001 3002 /// The ready-list for scheduling (only used for the dry-run). 3003 SetVector<ScheduleData *> ReadyInsts; 3004 3005 /// The first instruction of the scheduling region. 3006 Instruction *ScheduleStart = nullptr; 3007 3008 /// The first instruction _after_ the scheduling region. 3009 Instruction *ScheduleEnd = nullptr; 3010 3011 /// The first memory accessing instruction in the scheduling region 3012 /// (can be null). 3013 ScheduleData *FirstLoadStoreInRegion = nullptr; 3014 3015 /// The last memory accessing instruction in the scheduling region 3016 /// (can be null). 3017 ScheduleData *LastLoadStoreInRegion = nullptr; 3018 3019 /// The current size of the scheduling region. 3020 int ScheduleRegionSize = 0; 3021 3022 /// The maximum size allowed for the scheduling region. 3023 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget; 3024 3025 /// The ID of the scheduling region. For a new vectorization iteration this 3026 /// is incremented which "removes" all ScheduleData from the region. 3027 /// Make sure that the initial SchedulingRegionID is greater than the 3028 /// initial SchedulingRegionID in ScheduleData (which is 0). 3029 int SchedulingRegionID = 1; 3030 }; 3031 3032 /// Attaches the BlockScheduling structures to basic blocks. 3033 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules; 3034 3035 /// Performs the "real" scheduling. Done before vectorization is actually 3036 /// performed in a basic block. 3037 void scheduleBlock(BlockScheduling *BS); 3038 3039 /// List of users to ignore during scheduling and that don't need extracting. 3040 ArrayRef<Value *> UserIgnoreList; 3041 3042 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of 3043 /// sorted SmallVectors of unsigned. 
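  /// The empty and tombstone keys below are one-element vectors holding the
  /// sentinels ~1U and ~2U, which never collide with a real order (real orders
  /// only contain small lane indices). A sketch of the intended use, mirroring
  /// the OrdersUses counters in the reordering code further down:
  ///   MapVector<OrdersType, unsigned,
  ///             DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>> Uses;
  ///   ++Uses.insert(std::make_pair(SomeOrder, 0)).first->second;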
3044 struct OrdersTypeDenseMapInfo { 3045 static OrdersType getEmptyKey() { 3046 OrdersType V; 3047 V.push_back(~1U); 3048 return V; 3049 } 3050 3051 static OrdersType getTombstoneKey() { 3052 OrdersType V; 3053 V.push_back(~2U); 3054 return V; 3055 } 3056 3057 static unsigned getHashValue(const OrdersType &V) { 3058 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end())); 3059 } 3060 3061 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) { 3062 return LHS == RHS; 3063 } 3064 }; 3065 3066 // Analysis and block reference. 3067 Function *F; 3068 ScalarEvolution *SE; 3069 TargetTransformInfo *TTI; 3070 TargetLibraryInfo *TLI; 3071 LoopInfo *LI; 3072 DominatorTree *DT; 3073 AssumptionCache *AC; 3074 DemandedBits *DB; 3075 const DataLayout *DL; 3076 OptimizationRemarkEmitter *ORE; 3077 3078 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt. 3079 unsigned MinVecRegSize; // Set by cl::opt (default: 128). 3080 3081 /// Instruction builder to construct the vectorized tree. 3082 IRBuilder<> Builder; 3083 3084 /// A map of scalar integer values to the smallest bit width with which they 3085 /// can legally be represented. The values map to (width, signed) pairs, 3086 /// where "width" indicates the minimum bit width and "signed" is True if the 3087 /// value must be signed-extended, rather than zero-extended, back to its 3088 /// original width. 3089 MapVector<Value *, std::pair<uint64_t, bool>> MinBWs; 3090 }; 3091 3092 } // end namespace slpvectorizer 3093 3094 template <> struct GraphTraits<BoUpSLP *> { 3095 using TreeEntry = BoUpSLP::TreeEntry; 3096 3097 /// NodeRef has to be a pointer per the GraphWriter. 3098 using NodeRef = TreeEntry *; 3099 3100 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy; 3101 3102 /// Add the VectorizableTree to the index iterator to be able to return 3103 /// TreeEntry pointers. 3104 struct ChildIteratorType 3105 : public iterator_adaptor_base< 3106 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> { 3107 ContainerTy &VectorizableTree; 3108 3109 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W, 3110 ContainerTy &VT) 3111 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {} 3112 3113 NodeRef operator*() { return I->UserTE; } 3114 }; 3115 3116 static NodeRef getEntryNode(BoUpSLP &R) { 3117 return R.VectorizableTree[0].get(); 3118 } 3119 3120 static ChildIteratorType child_begin(NodeRef N) { 3121 return {N->UserTreeIndices.begin(), N->Container}; 3122 } 3123 3124 static ChildIteratorType child_end(NodeRef N) { 3125 return {N->UserTreeIndices.end(), N->Container}; 3126 } 3127 3128 /// For the node iterator we just need to turn the TreeEntry iterator into a 3129 /// TreeEntry* iterator so that it dereferences to NodeRef. 
3130 class nodes_iterator { 3131 using ItTy = ContainerTy::iterator; 3132 ItTy It; 3133 3134 public: 3135 nodes_iterator(const ItTy &It2) : It(It2) {} 3136 NodeRef operator*() { return It->get(); } 3137 nodes_iterator operator++() { 3138 ++It; 3139 return *this; 3140 } 3141 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; } 3142 }; 3143 3144 static nodes_iterator nodes_begin(BoUpSLP *R) { 3145 return nodes_iterator(R->VectorizableTree.begin()); 3146 } 3147 3148 static nodes_iterator nodes_end(BoUpSLP *R) { 3149 return nodes_iterator(R->VectorizableTree.end()); 3150 } 3151 3152 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); } 3153 }; 3154 3155 template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { 3156 using TreeEntry = BoUpSLP::TreeEntry; 3157 3158 DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} 3159 3160 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) { 3161 std::string Str; 3162 raw_string_ostream OS(Str); 3163 if (isSplat(Entry->Scalars)) 3164 OS << "<splat> "; 3165 for (auto V : Entry->Scalars) { 3166 OS << *V; 3167 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) { 3168 return EU.Scalar == V; 3169 })) 3170 OS << " <extract>"; 3171 OS << "\n"; 3172 } 3173 return Str; 3174 } 3175 3176 static std::string getNodeAttributes(const TreeEntry *Entry, 3177 const BoUpSLP *) { 3178 if (Entry->State == TreeEntry::NeedToGather) 3179 return "color=red"; 3180 return ""; 3181 } 3182 }; 3183 3184 } // end namespace llvm 3185 3186 BoUpSLP::~BoUpSLP() { 3187 for (const auto &Pair : DeletedInstructions) { 3188 // Replace operands of ignored instructions with Undefs in case if they were 3189 // marked for deletion. 3190 if (Pair.getSecond()) { 3191 Value *Undef = UndefValue::get(Pair.getFirst()->getType()); 3192 Pair.getFirst()->replaceAllUsesWith(Undef); 3193 } 3194 Pair.getFirst()->dropAllReferences(); 3195 } 3196 for (const auto &Pair : DeletedInstructions) { 3197 assert(Pair.getFirst()->use_empty() && 3198 "trying to erase instruction with users."); 3199 Pair.getFirst()->eraseFromParent(); 3200 } 3201 #ifdef EXPENSIVE_CHECKS 3202 // If we could guarantee that this call is not extremely slow, we could 3203 // remove the ifdef limitation (see PR47712). 3204 assert(!verifyFunction(*F, &dbgs())); 3205 #endif 3206 } 3207 3208 void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) { 3209 for (auto *V : AV) { 3210 if (auto *I = dyn_cast<Instruction>(V)) 3211 eraseInstruction(I, /*ReplaceOpsWithUndef=*/true); 3212 }; 3213 } 3214 3215 /// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses 3216 /// contains original mask for the scalars reused in the node. Procedure 3217 /// transform this mask in accordance with the given \p Mask. 3218 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) { 3219 assert(!Mask.empty() && Reuses.size() == Mask.size() && 3220 "Expected non-empty mask."); 3221 SmallVector<int> Prev(Reuses.begin(), Reuses.end()); 3222 Prev.swap(Reuses); 3223 for (unsigned I = 0, E = Prev.size(); I < E; ++I) 3224 if (Mask[I] != UndefMaskElem) 3225 Reuses[Mask[I]] = Prev[I]; 3226 } 3227 3228 /// Reorders the given \p Order according to the given \p Mask. \p Order - is 3229 /// the original order of the scalars. Procedure transforms the provided order 3230 /// in accordance with the given \p Mask. If the resulting \p Order is just an 3231 /// identity order, \p Order is cleared. 
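/// Note on the two encodings involved (summary, not new behavior): \p Order is
/// a permutation of lane indices (OrdersType), while \p Mask is a
/// shufflevector-style mask that may contain UndefMaskElem. For instance,
/// starting from an empty \p Order, a full mask such as {1, 2, 3, 0} simply
/// becomes the new order {1, 2, 3, 0}, while an identity mask leaves \p Order
/// empty.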
3232 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) { 3233 assert(!Mask.empty() && "Expected non-empty mask."); 3234 SmallVector<int> MaskOrder; 3235 if (Order.empty()) { 3236 MaskOrder.resize(Mask.size()); 3237 std::iota(MaskOrder.begin(), MaskOrder.end(), 0); 3238 } else { 3239 inversePermutation(Order, MaskOrder); 3240 } 3241 reorderReuses(MaskOrder, Mask); 3242 if (ShuffleVectorInst::isIdentityMask(MaskOrder)) { 3243 Order.clear(); 3244 return; 3245 } 3246 Order.assign(Mask.size(), Mask.size()); 3247 for (unsigned I = 0, E = Mask.size(); I < E; ++I) 3248 if (MaskOrder[I] != UndefMaskElem) 3249 Order[MaskOrder[I]] = I; 3250 fixupOrderingIndices(Order); 3251 } 3252 3253 Optional<BoUpSLP::OrdersType> 3254 BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { 3255 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only."); 3256 unsigned NumScalars = TE.Scalars.size(); 3257 OrdersType CurrentOrder(NumScalars, NumScalars); 3258 SmallVector<int> Positions; 3259 SmallBitVector UsedPositions(NumScalars); 3260 const TreeEntry *STE = nullptr; 3261 // Try to find all gathered scalars that are gets vectorized in other 3262 // vectorize node. Here we can have only one single tree vector node to 3263 // correctly identify order of the gathered scalars. 3264 for (unsigned I = 0; I < NumScalars; ++I) { 3265 Value *V = TE.Scalars[I]; 3266 if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V)) 3267 continue; 3268 if (const auto *LocalSTE = getTreeEntry(V)) { 3269 if (!STE) 3270 STE = LocalSTE; 3271 else if (STE != LocalSTE) 3272 // Take the order only from the single vector node. 3273 return None; 3274 unsigned Lane = 3275 std::distance(STE->Scalars.begin(), find(STE->Scalars, V)); 3276 if (Lane >= NumScalars) 3277 return None; 3278 if (CurrentOrder[Lane] != NumScalars) { 3279 if (Lane != I) 3280 continue; 3281 UsedPositions.reset(CurrentOrder[Lane]); 3282 } 3283 // The partial identity (where only some elements of the gather node are 3284 // in the identity order) is good. 3285 CurrentOrder[Lane] = I; 3286 UsedPositions.set(I); 3287 } 3288 } 3289 // Need to keep the order if we have a vector entry and at least 2 scalars or 3290 // the vectorized entry has just 2 scalars. 3291 if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) { 3292 auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) { 3293 for (unsigned I = 0; I < NumScalars; ++I) 3294 if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars) 3295 return false; 3296 return true; 3297 }; 3298 if (IsIdentityOrder(CurrentOrder)) { 3299 CurrentOrder.clear(); 3300 return CurrentOrder; 3301 } 3302 auto *It = CurrentOrder.begin(); 3303 for (unsigned I = 0; I < NumScalars;) { 3304 if (UsedPositions.test(I)) { 3305 ++I; 3306 continue; 3307 } 3308 if (*It == NumScalars) { 3309 *It = I; 3310 ++I; 3311 } 3312 ++It; 3313 } 3314 return CurrentOrder; 3315 } 3316 return None; 3317 } 3318 3319 Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE, 3320 bool TopToBottom) { 3321 // No need to reorder if need to shuffle reuses, still need to shuffle the 3322 // node. 
3323 if (!TE.ReuseShuffleIndices.empty()) 3324 return None; 3325 if (TE.State == TreeEntry::Vectorize && 3326 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) || 3327 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) && 3328 !TE.isAltShuffle()) 3329 return TE.ReorderIndices; 3330 if (TE.State == TreeEntry::NeedToGather) { 3331 // TODO: add analysis of other gather nodes with extractelement 3332 // instructions and other values/instructions, not only undefs. 3333 if (((TE.getOpcode() == Instruction::ExtractElement && 3334 !TE.isAltShuffle()) || 3335 (all_of(TE.Scalars, 3336 [](Value *V) { 3337 return isa<UndefValue, ExtractElementInst>(V); 3338 }) && 3339 any_of(TE.Scalars, 3340 [](Value *V) { return isa<ExtractElementInst>(V); }))) && 3341 all_of(TE.Scalars, 3342 [](Value *V) { 3343 auto *EE = dyn_cast<ExtractElementInst>(V); 3344 return !EE || isa<FixedVectorType>(EE->getVectorOperandType()); 3345 }) && 3346 allSameType(TE.Scalars)) { 3347 // Check that gather of extractelements can be represented as 3348 // just a shuffle of a single vector. 3349 OrdersType CurrentOrder; 3350 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder); 3351 if (Reuse || !CurrentOrder.empty()) { 3352 if (!CurrentOrder.empty()) 3353 fixupOrderingIndices(CurrentOrder); 3354 return CurrentOrder; 3355 } 3356 } 3357 if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE)) 3358 return CurrentOrder; 3359 } 3360 return None; 3361 } 3362 3363 void BoUpSLP::reorderTopToBottom() { 3364 // Maps VF to the graph nodes. 3365 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries; 3366 // ExtractElement gather nodes which can be vectorized and need to handle 3367 // their ordering. 3368 DenseMap<const TreeEntry *, OrdersType> GathersToOrders; 3369 // Find all reorderable nodes with the given VF. 3370 // Currently these are vectorized stores, loads, extracts + some gathering 3371 // of extracts. 3372 for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders]( 3373 const std::unique_ptr<TreeEntry> &TE) { 3374 if (Optional<OrdersType> CurrentOrder = 3375 getReorderingData(*TE, /*TopToBottom=*/true)) { 3376 // Do not include ordering for nodes used in the alt opcode vectorization; 3377 // it is better to reorder them during the bottom-to-top stage. If we 3378 // followed the order here, it would cause reordering of the whole graph, 3379 // though it is actually profitable just to reorder the subgraph that 3380 // starts from the alternate opcode vectorization node. Such nodes already 3381 // end up with a shuffle instruction, and it is enough to change this 3382 // shuffle rather than rotate the scalars for the whole graph. 3383 unsigned Cnt = 0; 3384 const TreeEntry *UserTE = TE.get(); 3385 while (UserTE && Cnt < RecursionMaxDepth) { 3386 if (UserTE->UserTreeIndices.size() != 1) 3387 break; 3388 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) { 3389 return EI.UserTE->State == TreeEntry::Vectorize && 3390 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0; 3391 })) 3392 return; 3393 if (UserTE->UserTreeIndices.empty()) 3394 UserTE = nullptr; 3395 else 3396 UserTE = UserTE->UserTreeIndices.back().UserTE; 3397 ++Cnt; 3398 } 3399 VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); 3400 if (TE->State != TreeEntry::Vectorize) 3401 GathersToOrders.try_emplace(TE.get(), *CurrentOrder); 3402 } 3403 }); 3404 3405 // Reorder the graph nodes according to their vectorization factor.
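  // For example, for a tree whose root entry has 8 scalars, the candidate
  // orders are counted and applied for VF = 8, then 4, then 2; entries of a
  // smaller VF reached in a given iteration only get their reuse masks
  // remapped (see the loop body below).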
3406 for (unsigned VF = VectorizableTree.front()->Scalars.size(); VF > 1; 3407 VF /= 2) { 3408 auto It = VFToOrderedEntries.find(VF); 3409 if (It == VFToOrderedEntries.end()) 3410 continue; 3411 // Try to find the most profitable order. We just are looking for the most 3412 // used order and reorder scalar elements in the nodes according to this 3413 // mostly used order. 3414 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef(); 3415 // All operands are reordered and used only in this node - propagate the 3416 // most used order to the user node. 3417 MapVector<OrdersType, unsigned, 3418 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>> 3419 OrdersUses; 3420 SmallPtrSet<const TreeEntry *, 4> VisitedOps; 3421 for (const TreeEntry *OpTE : OrderedEntries) { 3422 // No need to reorder this nodes, still need to extend and to use shuffle, 3423 // just need to merge reordering shuffle and the reuse shuffle. 3424 if (!OpTE->ReuseShuffleIndices.empty()) 3425 continue; 3426 // Count number of orders uses. 3427 const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & { 3428 if (OpTE->State == TreeEntry::NeedToGather) 3429 return GathersToOrders.find(OpTE)->second; 3430 return OpTE->ReorderIndices; 3431 }(); 3432 // Stores actually store the mask, not the order, need to invert. 3433 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && 3434 OpTE->getOpcode() == Instruction::Store && !Order.empty()) { 3435 SmallVector<int> Mask; 3436 inversePermutation(Order, Mask); 3437 unsigned E = Order.size(); 3438 OrdersType CurrentOrder(E, E); 3439 transform(Mask, CurrentOrder.begin(), [E](int Idx) { 3440 return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx); 3441 }); 3442 fixupOrderingIndices(CurrentOrder); 3443 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second; 3444 } else { 3445 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second; 3446 } 3447 } 3448 // Set order of the user node. 3449 if (OrdersUses.empty()) 3450 continue; 3451 // Choose the most used order. 3452 ArrayRef<unsigned> BestOrder = OrdersUses.front().first; 3453 unsigned Cnt = OrdersUses.front().second; 3454 for (const auto &Pair : drop_begin(OrdersUses)) { 3455 if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) { 3456 BestOrder = Pair.first; 3457 Cnt = Pair.second; 3458 } 3459 } 3460 // Set order of the user node. 3461 if (BestOrder.empty()) 3462 continue; 3463 SmallVector<int> Mask; 3464 inversePermutation(BestOrder, Mask); 3465 SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem); 3466 unsigned E = BestOrder.size(); 3467 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) { 3468 return I < E ? static_cast<int>(I) : UndefMaskElem; 3469 }); 3470 // Do an actual reordering, if profitable. 3471 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) { 3472 // Just do the reordering for the nodes with the given VF. 3473 if (TE->Scalars.size() != VF) { 3474 if (TE->ReuseShuffleIndices.size() == VF) { 3475 // Need to reorder the reuses masks of the operands with smaller VF to 3476 // be able to find the match between the graph nodes and scalar 3477 // operands of the given node during vectorization/cost estimation. 3478 assert(all_of(TE->UserTreeIndices, 3479 [VF, &TE](const EdgeInfo &EI) { 3480 return EI.UserTE->Scalars.size() == VF || 3481 EI.UserTE->Scalars.size() == 3482 TE->Scalars.size(); 3483 }) && 3484 "All users must be of VF size."); 3485 // Update ordering of the operands with the smaller VF than the given 3486 // one. 
3487 reorderReuses(TE->ReuseShuffleIndices, Mask); 3488 } 3489 continue; 3490 } 3491 if (TE->State == TreeEntry::Vectorize && 3492 isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst, 3493 InsertElementInst>(TE->getMainOp()) && 3494 !TE->isAltShuffle()) { 3495 // Build correct orders for extract{element,value}, loads and 3496 // stores. 3497 reorderOrder(TE->ReorderIndices, Mask); 3498 if (isa<InsertElementInst, StoreInst>(TE->getMainOp())) 3499 TE->reorderOperands(Mask); 3500 } else { 3501 // Reorder the node and its operands. 3502 TE->reorderOperands(Mask); 3503 assert(TE->ReorderIndices.empty() && 3504 "Expected empty reorder sequence."); 3505 reorderScalars(TE->Scalars, Mask); 3506 } 3507 if (!TE->ReuseShuffleIndices.empty()) { 3508 // Apply reversed order to keep the original ordering of the reused 3509 // elements to avoid extra reorder indices shuffling. 3510 OrdersType CurrentOrder; 3511 reorderOrder(CurrentOrder, MaskOrder); 3512 SmallVector<int> NewReuses; 3513 inversePermutation(CurrentOrder, NewReuses); 3514 addMask(NewReuses, TE->ReuseShuffleIndices); 3515 TE->ReuseShuffleIndices.swap(NewReuses); 3516 } 3517 } 3518 } 3519 } 3520 3521 bool BoUpSLP::canReorderOperands( 3522 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges, 3523 ArrayRef<TreeEntry *> ReorderableGathers, 3524 SmallVectorImpl<TreeEntry *> &GatherOps) { 3525 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) { 3526 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) { 3527 return OpData.first == I && 3528 OpData.second->State == TreeEntry::Vectorize; 3529 })) 3530 continue; 3531 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) { 3532 // Do not reorder if operand node is used by many user nodes. 3533 if (any_of(TE->UserTreeIndices, 3534 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; })) 3535 return false; 3536 // Add the node to the list of the ordered nodes with the identity 3537 // order. 3538 Edges.emplace_back(I, TE); 3539 continue; 3540 } 3541 ArrayRef<Value *> VL = UserTE->getOperand(I); 3542 TreeEntry *Gather = nullptr; 3543 if (count_if(ReorderableGathers, [VL, &Gather](TreeEntry *TE) { 3544 assert(TE->State != TreeEntry::Vectorize && 3545 "Only non-vectorized nodes are expected."); 3546 if (TE->isSame(VL)) { 3547 Gather = TE; 3548 return true; 3549 } 3550 return false; 3551 }) > 1) 3552 return false; 3553 if (Gather) 3554 GatherOps.push_back(Gather); 3555 } 3556 return true; 3557 } 3558 3559 void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { 3560 SetVector<TreeEntry *> OrderedEntries; 3561 DenseMap<const TreeEntry *, OrdersType> GathersToOrders; 3562 // Find all reorderable leaf nodes with the given VF. 3563 // Currently these are vectorized loads, extracts without alternate operands 3564 // + some gathering of extracts. 3565 SmallVector<TreeEntry *> NonVectorized; 3566 for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders, 3567 &NonVectorized]( 3568 const std::unique_ptr<TreeEntry> &TE) { 3569 if (TE->State != TreeEntry::Vectorize) 3570 NonVectorized.push_back(TE.get()); 3571 if (Optional<OrdersType> CurrentOrder = 3572 getReorderingData(*TE, /*TopToBottom=*/false)) { 3573 OrderedEntries.insert(TE.get()); 3574 if (TE->State != TreeEntry::Vectorize) 3575 GathersToOrders.try_emplace(TE.get(), *CurrentOrder); 3576 } 3577 }); 3578 3579 // 1. Propagate order to the graph nodes, which use only reordered nodes.
3580 // I.e., if the node has operands that are reordered, try to bring at least 3581 // one operand order into the natural order and reorder the others + reorder 3582 // the user node itself. 3583 SmallPtrSet<const TreeEntry *, 4> Visited; 3584 while (!OrderedEntries.empty()) { 3585 // 1. Filter out only reordered nodes. 3586 // 2. If the entry has multiple uses - skip it and jump to the next node. 3587 MapVector<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users; 3588 SmallVector<TreeEntry *> Filtered; 3589 for (TreeEntry *TE : OrderedEntries) { 3590 if (!(TE->State == TreeEntry::Vectorize || 3591 (TE->State == TreeEntry::NeedToGather && 3592 GathersToOrders.count(TE))) || 3593 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() || 3594 !all_of(drop_begin(TE->UserTreeIndices), 3595 [TE](const EdgeInfo &EI) { 3596 return EI.UserTE == TE->UserTreeIndices.front().UserTE; 3597 }) || 3598 !Visited.insert(TE).second) { 3599 Filtered.push_back(TE); 3600 continue; 3601 } 3602 // Build a map between user nodes and their operands order to speed up 3603 // the search. The graph currently does not provide this dependency directly. 3604 for (EdgeInfo &EI : TE->UserTreeIndices) { 3605 TreeEntry *UserTE = EI.UserTE; 3606 auto It = Users.find(UserTE); 3607 if (It == Users.end()) 3608 It = Users.insert({UserTE, {}}).first; 3609 It->second.emplace_back(EI.EdgeIdx, TE); 3610 } 3611 } 3612 // Erase filtered entries. 3613 for_each(Filtered, 3614 [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); }); 3615 for (auto &Data : Users) { 3616 // Check that operands are used only in the User node. 3617 SmallVector<TreeEntry *> GatherOps; 3618 if (!canReorderOperands(Data.first, Data.second, NonVectorized, 3619 GatherOps)) { 3620 for_each(Data.second, 3621 [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) { 3622 OrderedEntries.remove(Op.second); 3623 }); 3624 continue; 3625 } 3626 // All operands are reordered and used only in this node - propagate the 3627 // most used order to the user node. 3628 MapVector<OrdersType, unsigned, 3629 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>> 3630 OrdersUses; 3631 // Do the analysis for each tree entry only once, otherwise the order of 3632 // the same node may be considered several times, though it might not be 3633 // profitable. 3634 SmallPtrSet<const TreeEntry *, 4> VisitedOps; 3635 SmallPtrSet<const TreeEntry *, 4> VisitedUsers; 3636 for (const auto &Op : Data.second) { 3637 TreeEntry *OpTE = Op.second; 3638 if (!VisitedOps.insert(OpTE).second) 3639 continue; 3640 if (!OpTE->ReuseShuffleIndices.empty() || 3641 (IgnoreReorder && OpTE == VectorizableTree.front().get())) 3642 continue; 3643 const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & { 3644 if (OpTE->State == TreeEntry::NeedToGather) 3645 return GathersToOrders.find(OpTE)->second; 3646 return OpTE->ReorderIndices; 3647 }(); 3648 unsigned NumOps = count_if( 3649 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) { 3650 return P.second == OpTE; 3651 }); 3652 // Stores actually store the mask, not the order; we need to invert it. 3653 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && 3654 OpTE->getOpcode() == Instruction::Store && !Order.empty()) { 3655 SmallVector<int> Mask; 3656 inversePermutation(Order, Mask); 3657 unsigned E = Order.size(); 3658 OrdersType CurrentOrder(E, E); 3659 transform(Mask, CurrentOrder.begin(), [E](int Idx) { 3660 return Idx == UndefMaskElem ? 
E : static_cast<unsigned>(Idx); 3661 }); 3662 fixupOrderingIndices(CurrentOrder); 3663 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second += 3664 NumOps; 3665 } else { 3666 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps; 3667 } 3668 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0)); 3669 const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders]( 3670 const TreeEntry *TE) { 3671 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() || 3672 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) || 3673 (IgnoreReorder && TE->Idx == 0)) 3674 return true; 3675 if (TE->State == TreeEntry::NeedToGather) { 3676 auto It = GathersToOrders.find(TE); 3677 if (It != GathersToOrders.end()) 3678 return !It->second.empty(); 3679 return true; 3680 } 3681 return false; 3682 }; 3683 for (const EdgeInfo &EI : OpTE->UserTreeIndices) { 3684 TreeEntry *UserTE = EI.UserTE; 3685 if (!VisitedUsers.insert(UserTE).second) 3686 continue; 3687 // May reorder user node if it requires reordering, has reused 3688 // scalars, is an alternate op vectorize node or its op nodes require 3689 // reordering. 3690 if (AllowsReordering(UserTE)) 3691 continue; 3692 // Check if users allow reordering. 3693 // Currently look up just 1 level of operands to avoid increase of 3694 // the compile time. 3695 // Profitable to reorder if definitely more operands allow 3696 // reordering rather than those with natural order. 3697 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE]; 3698 if (static_cast<unsigned>(count_if( 3699 Ops, [UserTE, &AllowsReordering]( 3700 const std::pair<unsigned, TreeEntry *> &Op) { 3701 return AllowsReordering(Op.second) && 3702 all_of(Op.second->UserTreeIndices, 3703 [UserTE](const EdgeInfo &EI) { 3704 return EI.UserTE == UserTE; 3705 }); 3706 })) <= Ops.size() / 2) 3707 ++Res.first->second; 3708 } 3709 } 3710 // If no orders - skip current nodes and jump to the next one, if any. 3711 if (OrdersUses.empty()) { 3712 for_each(Data.second, 3713 [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) { 3714 OrderedEntries.remove(Op.second); 3715 }); 3716 continue; 3717 } 3718 // Choose the best order. 3719 ArrayRef<unsigned> BestOrder = OrdersUses.front().first; 3720 unsigned Cnt = OrdersUses.front().second; 3721 for (const auto &Pair : drop_begin(OrdersUses)) { 3722 if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) { 3723 BestOrder = Pair.first; 3724 Cnt = Pair.second; 3725 } 3726 } 3727 // Set order of the user node (reordering of operands and user nodes). 3728 if (BestOrder.empty()) { 3729 for_each(Data.second, 3730 [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) { 3731 OrderedEntries.remove(Op.second); 3732 }); 3733 continue; 3734 } 3735 // Erase operands from OrderedEntries list and adjust their orders. 3736 VisitedOps.clear(); 3737 SmallVector<int> Mask; 3738 inversePermutation(BestOrder, Mask); 3739 SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem); 3740 unsigned E = BestOrder.size(); 3741 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) { 3742 return I < E ? static_cast<int>(I) : UndefMaskElem; 3743 }); 3744 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) { 3745 TreeEntry *TE = Op.second; 3746 OrderedEntries.remove(TE); 3747 if (!VisitedOps.insert(TE).second) 3748 continue; 3749 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) { 3750 // Just reorder reuses indices. 
3751 reorderReuses(TE->ReuseShuffleIndices, Mask); 3752 continue; 3753 } 3754 // Gathers are processed separately. 3755 if (TE->State != TreeEntry::Vectorize) 3756 continue; 3757 assert((BestOrder.size() == TE->ReorderIndices.size() || 3758 TE->ReorderIndices.empty()) && 3759 "Non-matching sizes of user/operand entries."); 3760 reorderOrder(TE->ReorderIndices, Mask); 3761 } 3762 // For gathers just need to reorder its scalars. 3763 for (TreeEntry *Gather : GatherOps) { 3764 assert(Gather->ReorderIndices.empty() && 3765 "Unexpected reordering of gathers."); 3766 if (!Gather->ReuseShuffleIndices.empty()) { 3767 // Just reorder reuses indices. 3768 reorderReuses(Gather->ReuseShuffleIndices, Mask); 3769 continue; 3770 } 3771 reorderScalars(Gather->Scalars, Mask); 3772 OrderedEntries.remove(Gather); 3773 } 3774 // Reorder operands of the user node and set the ordering for the user 3775 // node itself. 3776 if (Data.first->State != TreeEntry::Vectorize || 3777 !isa<ExtractElementInst, ExtractValueInst, LoadInst>( 3778 Data.first->getMainOp()) || 3779 Data.first->isAltShuffle()) 3780 Data.first->reorderOperands(Mask); 3781 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) || 3782 Data.first->isAltShuffle()) { 3783 reorderScalars(Data.first->Scalars, Mask); 3784 reorderOrder(Data.first->ReorderIndices, MaskOrder); 3785 if (Data.first->ReuseShuffleIndices.empty() && 3786 !Data.first->ReorderIndices.empty() && 3787 !Data.first->isAltShuffle()) { 3788 // Insert user node to the list to try to sink reordering deeper in 3789 // the graph. 3790 OrderedEntries.insert(Data.first); 3791 } 3792 } else { 3793 reorderOrder(Data.first->ReorderIndices, Mask); 3794 } 3795 } 3796 } 3797 // If the reordering is unnecessary, just remove the reorder. 3798 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() && 3799 VectorizableTree.front()->ReuseShuffleIndices.empty()) 3800 VectorizableTree.front()->ReorderIndices.clear(); 3801 } 3802 3803 void BoUpSLP::buildExternalUses( 3804 const ExtraValueToDebugLocsMap &ExternallyUsedValues) { 3805 // Collect the values that we need to extract from the tree. 3806 for (auto &TEPtr : VectorizableTree) { 3807 TreeEntry *Entry = TEPtr.get(); 3808 3809 // No need to handle users of gathered values. 3810 if (Entry->State == TreeEntry::NeedToGather) 3811 continue; 3812 3813 // For each lane: 3814 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { 3815 Value *Scalar = Entry->Scalars[Lane]; 3816 int FoundLane = Entry->findLaneForValue(Scalar); 3817 3818 // Check if the scalar is externally used as an extra arg. 3819 auto ExtI = ExternallyUsedValues.find(Scalar); 3820 if (ExtI != ExternallyUsedValues.end()) { 3821 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " 3822 << Lane << " from " << *Scalar << ".\n"); 3823 ExternalUses.emplace_back(Scalar, nullptr, FoundLane); 3824 } 3825 for (User *U : Scalar->users()) { 3826 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); 3827 3828 Instruction *UserInst = dyn_cast<Instruction>(U); 3829 if (!UserInst) 3830 continue; 3831 3832 if (isDeleted(UserInst)) 3833 continue; 3834 3835 // Skip in-tree scalars that become vectors 3836 if (TreeEntry *UseEntry = getTreeEntry(U)) { 3837 Value *UseScalar = UseEntry->Scalars[0]; 3838 // Some in-tree scalars will remain as scalar in vectorized 3839 // instructions. If that is the case, the one in Lane 0 will 3840 // be used. 
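          // A typical example (illustrative): the pointer operand of a bundle
          // of consecutive loads. Once the loads are vectorized, only the
          // lane-0 pointer feeds the vector load, so the remaining lanes do
          // not need to be extracted even though in-tree scalars use them.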
3841 if (UseScalar != U || 3842 UseEntry->State == TreeEntry::ScatterVectorize || 3843 !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { 3844 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U 3845 << ".\n"); 3846 assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state"); 3847 continue; 3848 } 3849 } 3850 3851 // Ignore users in the user ignore list. 3852 if (is_contained(UserIgnoreList, UserInst)) 3853 continue; 3854 3855 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " 3856 << Lane << " from " << *Scalar << ".\n"); 3857 ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane)); 3858 } 3859 } 3860 } 3861 } 3862 3863 void BoUpSLP::buildTree(ArrayRef<Value *> Roots, 3864 ArrayRef<Value *> UserIgnoreLst) { 3865 deleteTree(); 3866 UserIgnoreList = UserIgnoreLst; 3867 if (!allSameType(Roots)) 3868 return; 3869 buildTree_rec(Roots, 0, EdgeInfo()); 3870 } 3871 3872 namespace { 3873 /// Tracks the state we can represent the loads in the given sequence. 3874 enum class LoadsState { Gather, Vectorize, ScatterVectorize }; 3875 } // anonymous namespace 3876 3877 /// Checks if the given array of loads can be represented as a vectorized, 3878 /// scatter or just simple gather. 3879 static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, 3880 const TargetTransformInfo &TTI, 3881 const DataLayout &DL, ScalarEvolution &SE, 3882 SmallVectorImpl<unsigned> &Order, 3883 SmallVectorImpl<Value *> &PointerOps) { 3884 // Check that a vectorized load would load the same memory as a scalar 3885 // load. For example, we don't want to vectorize loads that are smaller 3886 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM 3887 // treats loading/storing it as an i8 struct. If we vectorize loads/stores 3888 // from such a struct, we read/write packed bits disagreeing with the 3889 // unvectorized version. 3890 Type *ScalarTy = VL0->getType(); 3891 3892 if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy)) 3893 return LoadsState::Gather; 3894 3895 // Make sure all loads in the bundle are simple - we can't vectorize 3896 // atomic or volatile loads. 3897 PointerOps.clear(); 3898 PointerOps.resize(VL.size()); 3899 auto *POIter = PointerOps.begin(); 3900 for (Value *V : VL) { 3901 auto *L = cast<LoadInst>(V); 3902 if (!L->isSimple()) 3903 return LoadsState::Gather; 3904 *POIter = L->getPointerOperand(); 3905 ++POIter; 3906 } 3907 3908 Order.clear(); 3909 // Check the order of pointer operands. 3910 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order)) { 3911 Value *Ptr0; 3912 Value *PtrN; 3913 if (Order.empty()) { 3914 Ptr0 = PointerOps.front(); 3915 PtrN = PointerOps.back(); 3916 } else { 3917 Ptr0 = PointerOps[Order.front()]; 3918 PtrN = PointerOps[Order.back()]; 3919 } 3920 Optional<int> Diff = 3921 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE); 3922 // Check that the sorted loads are consecutive. 3923 if (static_cast<unsigned>(*Diff) == VL.size() - 1) 3924 return LoadsState::Vectorize; 3925 Align CommonAlignment = cast<LoadInst>(VL0)->getAlign(); 3926 for (Value *V : VL) 3927 CommonAlignment = 3928 commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); 3929 if (TTI.isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()), 3930 CommonAlignment)) 3931 return LoadsState::ScatterVectorize; 3932 } 3933 3934 return LoadsState::Gather; 3935 } 3936 3937 /// \return true if the specified list of values has only one instruction that 3938 /// requires scheduling, false otherwise. 
3939 #ifndef NDEBUG 3940 static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) { 3941 Value *NeedsScheduling = nullptr; 3942 for (Value *V : VL) { 3943 if (doesNotNeedToBeScheduled(V)) 3944 continue; 3945 if (!NeedsScheduling) { 3946 NeedsScheduling = V; 3947 continue; 3948 } 3949 return false; 3950 } 3951 return NeedsScheduling; 3952 } 3953 #endif 3954 3955 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, 3956 const EdgeInfo &UserTreeIdx) { 3957 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); 3958 3959 SmallVector<int> ReuseShuffleIndicies; 3960 SmallVector<Value *> UniqueValues; 3961 auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues, 3962 &UserTreeIdx, 3963 this](const InstructionsState &S) { 3964 // Check that every instruction appears once in this bundle. 3965 DenseMap<Value *, unsigned> UniquePositions; 3966 for (Value *V : VL) { 3967 if (isConstant(V)) { 3968 ReuseShuffleIndicies.emplace_back( 3969 isa<UndefValue>(V) ? UndefMaskElem : UniqueValues.size()); 3970 UniqueValues.emplace_back(V); 3971 continue; 3972 } 3973 auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); 3974 ReuseShuffleIndicies.emplace_back(Res.first->second); 3975 if (Res.second) 3976 UniqueValues.emplace_back(V); 3977 } 3978 size_t NumUniqueScalarValues = UniqueValues.size(); 3979 if (NumUniqueScalarValues == VL.size()) { 3980 ReuseShuffleIndicies.clear(); 3981 } else { 3982 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); 3983 if (NumUniqueScalarValues <= 1 || 3984 (UniquePositions.size() == 1 && all_of(UniqueValues, 3985 [](Value *V) { 3986 return isa<UndefValue>(V) || 3987 !isConstant(V); 3988 })) || 3989 !llvm::isPowerOf2_32(NumUniqueScalarValues)) { 3990 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); 3991 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); 3992 return false; 3993 } 3994 VL = UniqueValues; 3995 } 3996 return true; 3997 }; 3998 3999 InstructionsState S = getSameOpcode(VL); 4000 if (Depth == RecursionMaxDepth) { 4001 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); 4002 if (TryToFindDuplicates(S)) 4003 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4004 ReuseShuffleIndicies); 4005 return; 4006 } 4007 4008 // Don't handle scalable vectors 4009 if (S.getOpcode() == Instruction::ExtractElement && 4010 isa<ScalableVectorType>( 4011 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) { 4012 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n"); 4013 if (TryToFindDuplicates(S)) 4014 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4015 ReuseShuffleIndicies); 4016 return; 4017 } 4018 4019 // Don't handle vectors. 4020 if (S.OpValue->getType()->isVectorTy() && 4021 !isa<InsertElementInst>(S.OpValue)) { 4022 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); 4023 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); 4024 return; 4025 } 4026 4027 if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue)) 4028 if (SI->getValueOperand()->getType()->isVectorTy()) { 4029 LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); 4030 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); 4031 return; 4032 } 4033 4034 // If all of the operands are identical or constant we have a simple solution. 4035 // If we deal with insert/extract instructions, they all must have constant 4036 // indices, otherwise we should gather them, not try to vectorize. 
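  // Illustrative example (hypothetical IR, not from a test): a splat bundle
  // such as VL = { %a, %a, %a, %a } with
  //   %a = add i32 %x, 1
  // is gathered rather than turned into a vector add, and the same applies to
  // an all-constant bundle or to lanes coming from different basic blocks.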
4037 if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode() || 4038 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(S.MainOp) && 4039 !all_of(VL, isVectorLikeInstWithConstOps))) { 4040 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); 4041 if (TryToFindDuplicates(S)) 4042 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4043 ReuseShuffleIndicies); 4044 return; 4045 } 4046 4047 // We now know that this is a vector of instructions of the same type from 4048 // the same block. 4049 4050 // Don't vectorize ephemeral values. 4051 for (Value *V : VL) { 4052 if (EphValues.count(V)) { 4053 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V 4054 << ") is ephemeral.\n"); 4055 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); 4056 return; 4057 } 4058 } 4059 4060 // Check if this is a duplicate of another entry. 4061 if (TreeEntry *E = getTreeEntry(S.OpValue)) { 4062 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); 4063 if (!E->isSame(VL)) { 4064 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); 4065 if (TryToFindDuplicates(S)) 4066 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4067 ReuseShuffleIndicies); 4068 return; 4069 } 4070 // Record the reuse of the tree node. FIXME, currently this is only used to 4071 // properly draw the graph rather than for the actual vectorization. 4072 E->UserTreeIndices.push_back(UserTreeIdx); 4073 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue 4074 << ".\n"); 4075 return; 4076 } 4077 4078 // Check that none of the instructions in the bundle are already in the tree. 4079 for (Value *V : VL) { 4080 auto *I = dyn_cast<Instruction>(V); 4081 if (!I) 4082 continue; 4083 if (getTreeEntry(I)) { 4084 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V 4085 << ") is already in tree.\n"); 4086 if (TryToFindDuplicates(S)) 4087 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4088 ReuseShuffleIndicies); 4089 return; 4090 } 4091 } 4092 4093 // The reduction nodes (stored in UserIgnoreList) also should stay scalar. 4094 for (Value *V : VL) { 4095 if (is_contained(UserIgnoreList, V)) { 4096 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); 4097 if (TryToFindDuplicates(S)) 4098 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4099 ReuseShuffleIndicies); 4100 return; 4101 } 4102 } 4103 4104 // Check that all of the users of the scalars that we want to vectorize are 4105 // schedulable. 4106 auto *VL0 = cast<Instruction>(S.OpValue); 4107 BasicBlock *BB = VL0->getParent(); 4108 4109 if (!DT->isReachableFromEntry(BB)) { 4110 // Don't go into unreachable blocks. They may contain instructions with 4111 // dependency cycles which confuse the final scheduling. 4112 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); 4113 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); 4114 return; 4115 } 4116 4117 // Check that every instruction appears once in this bundle. 
4118 if (!TryToFindDuplicates(S)) 4119 return; 4120 4121 auto &BSRef = BlocksSchedules[BB]; 4122 if (!BSRef) 4123 BSRef = std::make_unique<BlockScheduling>(BB); 4124 4125 BlockScheduling &BS = *BSRef; 4126 4127 Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S); 4128 #ifdef EXPENSIVE_CHECKS 4129 // Make sure we didn't break any internal invariants 4130 BS.verify(); 4131 #endif 4132 if (!Bundle) { 4133 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); 4134 assert((!BS.getScheduleData(VL0) || 4135 !BS.getScheduleData(VL0)->isPartOfBundle()) && 4136 "tryScheduleBundle should cancelScheduling on failure"); 4137 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4138 ReuseShuffleIndicies); 4139 return; 4140 } 4141 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); 4142 4143 unsigned ShuffleOrOp = S.isAltShuffle() ? 4144 (unsigned) Instruction::ShuffleVector : S.getOpcode(); 4145 switch (ShuffleOrOp) { 4146 case Instruction::PHI: { 4147 auto *PH = cast<PHINode>(VL0); 4148 4149 // Check for terminator values (e.g. invoke). 4150 for (Value *V : VL) 4151 for (Value *Incoming : cast<PHINode>(V)->incoming_values()) { 4152 Instruction *Term = dyn_cast<Instruction>(Incoming); 4153 if (Term && Term->isTerminator()) { 4154 LLVM_DEBUG(dbgs() 4155 << "SLP: Need to swizzle PHINodes (terminator use).\n"); 4156 BS.cancelScheduling(VL, VL0); 4157 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4158 ReuseShuffleIndicies); 4159 return; 4160 } 4161 } 4162 4163 TreeEntry *TE = 4164 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies); 4165 LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); 4166 4167 // Keeps the reordered operands to avoid code duplication. 4168 SmallVector<ValueList, 2> OperandsVec; 4169 for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) { 4170 if (!DT->isReachableFromEntry(PH->getIncomingBlock(I))) { 4171 ValueList Operands(VL.size(), PoisonValue::get(PH->getType())); 4172 TE->setOperand(I, Operands); 4173 OperandsVec.push_back(Operands); 4174 continue; 4175 } 4176 ValueList Operands; 4177 // Prepare the operand vector. 4178 for (Value *V : VL) 4179 Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock( 4180 PH->getIncomingBlock(I))); 4181 TE->setOperand(I, Operands); 4182 OperandsVec.push_back(Operands); 4183 } 4184 for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx) 4185 buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx}); 4186 return; 4187 } 4188 case Instruction::ExtractValue: 4189 case Instruction::ExtractElement: { 4190 OrdersType CurrentOrder; 4191 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); 4192 if (Reuse) { 4193 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); 4194 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 4195 ReuseShuffleIndicies); 4196 // This is a special case, as it does not gather, but at the same time 4197 // we are not extending buildTree_rec() towards the operands. 
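        // Shape handled here (hypothetical IR sketch):
        //   %e0 = extractelement <4 x float> %v, i32 0
        //   %e1 = extractelement <4 x float> %v, i32 1
        //   ...
        // All lanes can be served from %v (possibly through a single shuffle),
        // so %v is recorded as operand 0 below but no child tree entry is
        // built for it.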
4198 ValueList Op0; 4199 Op0.assign(VL.size(), VL0->getOperand(0)); 4200 VectorizableTree.back()->setOperand(0, Op0); 4201 return; 4202 } 4203 if (!CurrentOrder.empty()) { 4204 LLVM_DEBUG({ 4205 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence " 4206 "with order"; 4207 for (unsigned Idx : CurrentOrder) 4208 dbgs() << " " << Idx; 4209 dbgs() << "\n"; 4210 }); 4211 fixupOrderingIndices(CurrentOrder); 4212 // Insert new order with initial value 0, if it does not exist, 4213 // otherwise return the iterator to the existing one. 4214 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 4215 ReuseShuffleIndicies, CurrentOrder); 4216 // This is a special case, as it does not gather, but at the same time 4217 // we are not extending buildTree_rec() towards the operands. 4218 ValueList Op0; 4219 Op0.assign(VL.size(), VL0->getOperand(0)); 4220 VectorizableTree.back()->setOperand(0, Op0); 4221 return; 4222 } 4223 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); 4224 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4225 ReuseShuffleIndicies); 4226 BS.cancelScheduling(VL, VL0); 4227 return; 4228 } 4229 case Instruction::InsertElement: { 4230 assert(ReuseShuffleIndicies.empty() && "All inserts should be unique"); 4231 4232 // Check that we have a buildvector and not a shuffle of 2 or more 4233 // different vectors. 4234 ValueSet SourceVectors; 4235 for (Value *V : VL) { 4236 SourceVectors.insert(cast<Instruction>(V)->getOperand(0)); 4237 assert(getInsertIndex(V) != None && "Non-constant or undef index?"); 4238 } 4239 4240 if (count_if(VL, [&SourceVectors](Value *V) { 4241 return !SourceVectors.contains(V); 4242 }) >= 2) { 4243 // Found 2nd source vector - cancel. 4244 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with " 4245 "different source vectors.\n"); 4246 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); 4247 BS.cancelScheduling(VL, VL0); 4248 return; 4249 } 4250 4251 auto OrdCompare = [](const std::pair<int, int> &P1, 4252 const std::pair<int, int> &P2) { 4253 return P1.first > P2.first; 4254 }; 4255 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>, 4256 decltype(OrdCompare)> 4257 Indices(OrdCompare); 4258 for (int I = 0, E = VL.size(); I < E; ++I) { 4259 unsigned Idx = *getInsertIndex(VL[I]); 4260 Indices.emplace(Idx, I); 4261 } 4262 OrdersType CurrentOrder(VL.size(), VL.size()); 4263 bool IsIdentity = true; 4264 for (int I = 0, E = VL.size(); I < E; ++I) { 4265 CurrentOrder[Indices.top().second] = I; 4266 IsIdentity &= Indices.top().second == I; 4267 Indices.pop(); 4268 } 4269 if (IsIdentity) 4270 CurrentOrder.clear(); 4271 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 4272 None, CurrentOrder); 4273 LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n"); 4274 4275 constexpr int NumOps = 2; 4276 ValueList VectorOperands[NumOps]; 4277 for (int I = 0; I < NumOps; ++I) { 4278 for (Value *V : VL) 4279 VectorOperands[I].push_back(cast<Instruction>(V)->getOperand(I)); 4280 4281 TE->setOperand(I, VectorOperands[I]); 4282 } 4283 buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, NumOps - 1}); 4284 return; 4285 } 4286 case Instruction::Load: { 4287 // Check that a vectorized load would load the same memory as a scalar 4288 // load. For example, we don't want to vectorize loads that are smaller 4289 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM 4290 // treats loading/storing it as an i8 struct. 
If we vectorize loads/stores 4291 // from such a struct, we read/write packed bits disagreeing with the 4292 // unvectorized version. 4293 SmallVector<Value *> PointerOps; 4294 OrdersType CurrentOrder; 4295 TreeEntry *TE = nullptr; 4296 switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, CurrentOrder, 4297 PointerOps)) { 4298 case LoadsState::Vectorize: 4299 if (CurrentOrder.empty()) { 4300 // Original loads are consecutive and does not require reordering. 4301 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 4302 ReuseShuffleIndicies); 4303 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); 4304 } else { 4305 fixupOrderingIndices(CurrentOrder); 4306 // Need to reorder. 4307 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 4308 ReuseShuffleIndicies, CurrentOrder); 4309 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); 4310 } 4311 TE->setOperandsInOrder(); 4312 break; 4313 case LoadsState::ScatterVectorize: 4314 // Vectorizing non-consecutive loads with `llvm.masked.gather`. 4315 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, 4316 UserTreeIdx, ReuseShuffleIndicies); 4317 TE->setOperandsInOrder(); 4318 buildTree_rec(PointerOps, Depth + 1, {TE, 0}); 4319 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); 4320 break; 4321 case LoadsState::Gather: 4322 BS.cancelScheduling(VL, VL0); 4323 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4324 ReuseShuffleIndicies); 4325 #ifndef NDEBUG 4326 Type *ScalarTy = VL0->getType(); 4327 if (DL->getTypeSizeInBits(ScalarTy) != 4328 DL->getTypeAllocSizeInBits(ScalarTy)) 4329 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); 4330 else if (any_of(VL, [](Value *V) { 4331 return !cast<LoadInst>(V)->isSimple(); 4332 })) 4333 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); 4334 else 4335 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); 4336 #endif // NDEBUG 4337 break; 4338 } 4339 return; 4340 } 4341 case Instruction::ZExt: 4342 case Instruction::SExt: 4343 case Instruction::FPToUI: 4344 case Instruction::FPToSI: 4345 case Instruction::FPExt: 4346 case Instruction::PtrToInt: 4347 case Instruction::IntToPtr: 4348 case Instruction::SIToFP: 4349 case Instruction::UIToFP: 4350 case Instruction::Trunc: 4351 case Instruction::FPTrunc: 4352 case Instruction::BitCast: { 4353 Type *SrcTy = VL0->getOperand(0)->getType(); 4354 for (Value *V : VL) { 4355 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType(); 4356 if (Ty != SrcTy || !isValidElementType(Ty)) { 4357 BS.cancelScheduling(VL, VL0); 4358 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4359 ReuseShuffleIndicies); 4360 LLVM_DEBUG(dbgs() 4361 << "SLP: Gathering casts with different src types.\n"); 4362 return; 4363 } 4364 } 4365 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 4366 ReuseShuffleIndicies); 4367 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); 4368 4369 TE->setOperandsInOrder(); 4370 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 4371 ValueList Operands; 4372 // Prepare the operand vector. 4373 for (Value *V : VL) 4374 Operands.push_back(cast<Instruction>(V)->getOperand(i)); 4375 4376 buildTree_rec(Operands, Depth + 1, {TE, i}); 4377 } 4378 return; 4379 } 4380 case Instruction::ICmp: 4381 case Instruction::FCmp: { 4382 // Check that all of the compares have the same predicate. 
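      // The swapped predicate also counts as matching: e.g. 'icmp sgt %a, %b'
      // may be bundled with 'icmp slt %b, %a'; the operands of swapped lanes
      // are commuted when the operand vectors are collected below.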
4383 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); 4384 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0); 4385 Type *ComparedTy = VL0->getOperand(0)->getType(); 4386 for (Value *V : VL) { 4387 CmpInst *Cmp = cast<CmpInst>(V); 4388 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) || 4389 Cmp->getOperand(0)->getType() != ComparedTy) { 4390 BS.cancelScheduling(VL, VL0); 4391 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4392 ReuseShuffleIndicies); 4393 LLVM_DEBUG(dbgs() 4394 << "SLP: Gathering cmp with different predicate.\n"); 4395 return; 4396 } 4397 } 4398 4399 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 4400 ReuseShuffleIndicies); 4401 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); 4402 4403 ValueList Left, Right; 4404 if (cast<CmpInst>(VL0)->isCommutative()) { 4405 // Commutative predicate - collect + sort operands of the instructions 4406 // so that each side is more likely to have the same opcode. 4407 assert(P0 == SwapP0 && "Commutative Predicate mismatch"); 4408 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); 4409 } else { 4410 // Collect operands - commute if it uses the swapped predicate. 4411 for (Value *V : VL) { 4412 auto *Cmp = cast<CmpInst>(V); 4413 Value *LHS = Cmp->getOperand(0); 4414 Value *RHS = Cmp->getOperand(1); 4415 if (Cmp->getPredicate() != P0) 4416 std::swap(LHS, RHS); 4417 Left.push_back(LHS); 4418 Right.push_back(RHS); 4419 } 4420 } 4421 TE->setOperand(0, Left); 4422 TE->setOperand(1, Right); 4423 buildTree_rec(Left, Depth + 1, {TE, 0}); 4424 buildTree_rec(Right, Depth + 1, {TE, 1}); 4425 return; 4426 } 4427 case Instruction::Select: 4428 case Instruction::FNeg: 4429 case Instruction::Add: 4430 case Instruction::FAdd: 4431 case Instruction::Sub: 4432 case Instruction::FSub: 4433 case Instruction::Mul: 4434 case Instruction::FMul: 4435 case Instruction::UDiv: 4436 case Instruction::SDiv: 4437 case Instruction::FDiv: 4438 case Instruction::URem: 4439 case Instruction::SRem: 4440 case Instruction::FRem: 4441 case Instruction::Shl: 4442 case Instruction::LShr: 4443 case Instruction::AShr: 4444 case Instruction::And: 4445 case Instruction::Or: 4446 case Instruction::Xor: { 4447 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 4448 ReuseShuffleIndicies); 4449 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n"); 4450 4451 // Sort operands of the instructions so that each side is more likely to 4452 // have the same opcode. 4453 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { 4454 ValueList Left, Right; 4455 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); 4456 TE->setOperand(0, Left); 4457 TE->setOperand(1, Right); 4458 buildTree_rec(Left, Depth + 1, {TE, 0}); 4459 buildTree_rec(Right, Depth + 1, {TE, 1}); 4460 return; 4461 } 4462 4463 TE->setOperandsInOrder(); 4464 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 4465 ValueList Operands; 4466 // Prepare the operand vector. 4467 for (Value *V : VL) 4468 Operands.push_back(cast<Instruction>(V)->getOperand(i)); 4469 4470 buildTree_rec(Operands, Depth + 1, {TE, i}); 4471 } 4472 return; 4473 } 4474 case Instruction::GetElementPtr: { 4475 // We don't combine GEPs with complicated (nested) indexing. 
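      // Only GEPs with a single index operand are handled, i.e. the shape
      //   %g = getelementptr i32, i32* %base, i64 %idx
      // A GEP such as 'getelementptr [4 x i32], [4 x i32]* %p, i64 0, i64 %i'
      // carries two indices (three operands) and is rejected below.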
4476 for (Value *V : VL) { 4477 if (cast<Instruction>(V)->getNumOperands() != 2) { 4478 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); 4479 BS.cancelScheduling(VL, VL0); 4480 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4481 ReuseShuffleIndicies); 4482 return; 4483 } 4484 } 4485 4486 // We can't combine several GEPs into one vector if they operate on 4487 // different types. 4488 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType(); 4489 for (Value *V : VL) { 4490 Type *CurTy = cast<GEPOperator>(V)->getSourceElementType(); 4491 if (Ty0 != CurTy) { 4492 LLVM_DEBUG(dbgs() 4493 << "SLP: not-vectorizable GEP (different types).\n"); 4494 BS.cancelScheduling(VL, VL0); 4495 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4496 ReuseShuffleIndicies); 4497 return; 4498 } 4499 } 4500 4501 // We don't combine GEPs with non-constant indexes. 4502 Type *Ty1 = VL0->getOperand(1)->getType(); 4503 for (Value *V : VL) { 4504 auto Op = cast<Instruction>(V)->getOperand(1); 4505 if (!isa<ConstantInt>(Op) || 4506 (Op->getType() != Ty1 && 4507 Op->getType()->getScalarSizeInBits() > 4508 DL->getIndexSizeInBits( 4509 V->getType()->getPointerAddressSpace()))) { 4510 LLVM_DEBUG(dbgs() 4511 << "SLP: not-vectorizable GEP (non-constant indexes).\n"); 4512 BS.cancelScheduling(VL, VL0); 4513 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4514 ReuseShuffleIndicies); 4515 return; 4516 } 4517 } 4518 4519 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 4520 ReuseShuffleIndicies); 4521 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); 4522 SmallVector<ValueList, 2> Operands(2); 4523 // Prepare the operand vector for pointer operands. 4524 for (Value *V : VL) 4525 Operands.front().push_back( 4526 cast<GetElementPtrInst>(V)->getPointerOperand()); 4527 TE->setOperand(0, Operands.front()); 4528 // Need to cast all indices to the same type before vectorization to 4529 // avoid crash. 4530 // Required to be able to find correct matches between different gather 4531 // nodes and reuse the vectorized values rather than trying to gather them 4532 // again. 4533 int IndexIdx = 1; 4534 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType(); 4535 Type *Ty = all_of(VL, 4536 [VL0Ty, IndexIdx](Value *V) { 4537 return VL0Ty == cast<GetElementPtrInst>(V) 4538 ->getOperand(IndexIdx) 4539 ->getType(); 4540 }) 4541 ? VL0Ty 4542 : DL->getIndexType(cast<GetElementPtrInst>(VL0) 4543 ->getPointerOperandType() 4544 ->getScalarType()); 4545 // Prepare the operand vector. 4546 for (Value *V : VL) { 4547 auto *Op = cast<Instruction>(V)->getOperand(IndexIdx); 4548 auto *CI = cast<ConstantInt>(Op); 4549 Operands.back().push_back(ConstantExpr::getIntegerCast( 4550 CI, Ty, CI->getValue().isSignBitSet())); 4551 } 4552 TE->setOperand(IndexIdx, Operands.back()); 4553 4554 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I) 4555 buildTree_rec(Operands[I], Depth + 1, {TE, I}); 4556 return; 4557 } 4558 case Instruction::Store: { 4559 // Check if the stores are consecutive or if we need to swizzle them. 4560 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType(); 4561 // Avoid types that are padded when being allocated as scalars, while 4562 // being packed together in a vector (such as i1). 
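// For example, with a typical data layout an i1 has a type size of 1 bit but
// an alloc size of 8 bits, so the check below sees 1 != 8 and the bundle is
// gathered instead of being turned into a packed vector store.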
4563 if (DL->getTypeSizeInBits(ScalarTy) != 4564 DL->getTypeAllocSizeInBits(ScalarTy)) { 4565 BS.cancelScheduling(VL, VL0); 4566 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4567 ReuseShuffleIndicies); 4568 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n"); 4569 return; 4570 } 4571 // Make sure all stores in the bundle are simple - we can't vectorize 4572 // atomic or volatile stores. 4573 SmallVector<Value *, 4> PointerOps(VL.size()); 4574 ValueList Operands(VL.size()); 4575 auto POIter = PointerOps.begin(); 4576 auto OIter = Operands.begin(); 4577 for (Value *V : VL) { 4578 auto *SI = cast<StoreInst>(V); 4579 if (!SI->isSimple()) { 4580 BS.cancelScheduling(VL, VL0); 4581 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4582 ReuseShuffleIndicies); 4583 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n"); 4584 return; 4585 } 4586 *POIter = SI->getPointerOperand(); 4587 *OIter = SI->getValueOperand(); 4588 ++POIter; 4589 ++OIter; 4590 } 4591 4592 OrdersType CurrentOrder; 4593 // Check the order of pointer operands. 4594 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) { 4595 Value *Ptr0; 4596 Value *PtrN; 4597 if (CurrentOrder.empty()) { 4598 Ptr0 = PointerOps.front(); 4599 PtrN = PointerOps.back(); 4600 } else { 4601 Ptr0 = PointerOps[CurrentOrder.front()]; 4602 PtrN = PointerOps[CurrentOrder.back()]; 4603 } 4604 Optional<int> Dist = 4605 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE); 4606 // Check that the sorted pointer operands are consecutive. 4607 if (static_cast<unsigned>(*Dist) == VL.size() - 1) { 4608 if (CurrentOrder.empty()) { 4609 // Original stores are consecutive and does not require reordering. 4610 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, 4611 UserTreeIdx, ReuseShuffleIndicies); 4612 TE->setOperandsInOrder(); 4613 buildTree_rec(Operands, Depth + 1, {TE, 0}); 4614 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); 4615 } else { 4616 fixupOrderingIndices(CurrentOrder); 4617 TreeEntry *TE = 4618 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 4619 ReuseShuffleIndicies, CurrentOrder); 4620 TE->setOperandsInOrder(); 4621 buildTree_rec(Operands, Depth + 1, {TE, 0}); 4622 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); 4623 } 4624 return; 4625 } 4626 } 4627 4628 BS.cancelScheduling(VL, VL0); 4629 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4630 ReuseShuffleIndicies); 4631 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); 4632 return; 4633 } 4634 case Instruction::Call: { 4635 // Check if the calls are all to the same vectorizable intrinsic or 4636 // library function. 
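// Illustrative example (hypothetical IR): four calls to @llvm.fabs.f32 can be
// widened to a single @llvm.fabs.v4f32 (or to a vector library routine found
// through VFDatabase), while calls to an arbitrary external function with no
// vector mapping are rejected below.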
4637 CallInst *CI = cast<CallInst>(VL0); 4638 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4639 4640 VFShape Shape = VFShape::get( 4641 *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())), 4642 false /*HasGlobalPred*/); 4643 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 4644 4645 if (!VecFunc && !isTriviallyVectorizable(ID)) { 4646 BS.cancelScheduling(VL, VL0); 4647 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4648 ReuseShuffleIndicies); 4649 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); 4650 return; 4651 } 4652 Function *F = CI->getCalledFunction(); 4653 unsigned NumArgs = CI->arg_size(); 4654 SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr); 4655 for (unsigned j = 0; j != NumArgs; ++j) 4656 if (hasVectorInstrinsicScalarOpd(ID, j)) 4657 ScalarArgs[j] = CI->getArgOperand(j); 4658 for (Value *V : VL) { 4659 CallInst *CI2 = dyn_cast<CallInst>(V); 4660 if (!CI2 || CI2->getCalledFunction() != F || 4661 getVectorIntrinsicIDForCall(CI2, TLI) != ID || 4662 (VecFunc && 4663 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) || 4664 !CI->hasIdenticalOperandBundleSchema(*CI2)) { 4665 BS.cancelScheduling(VL, VL0); 4666 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4667 ReuseShuffleIndicies); 4668 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V 4669 << "\n"); 4670 return; 4671 } 4672 // Some intrinsics have scalar arguments, and these must be the same across 4673 // the bundle for it to be vectorized. 4674 for (unsigned j = 0; j != NumArgs; ++j) { 4675 if (hasVectorInstrinsicScalarOpd(ID, j)) { 4676 Value *A1J = CI2->getArgOperand(j); 4677 if (ScalarArgs[j] != A1J) { 4678 BS.cancelScheduling(VL, VL0); 4679 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4680 ReuseShuffleIndicies); 4681 LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI 4682 << " argument " << ScalarArgs[j] << "!=" << A1J 4683 << "\n"); 4684 return; 4685 } 4686 } 4687 } 4688 // Verify that the bundle operands are identical between the two calls. 4689 if (CI->hasOperandBundles() && 4690 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(), 4691 CI->op_begin() + CI->getBundleOperandsEndIndex(), 4692 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { 4693 BS.cancelScheduling(VL, VL0); 4694 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4695 ReuseShuffleIndicies); 4696 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" 4697 << *CI << "!=" << *V << '\n'); 4698 return; 4699 } 4700 } 4701 4702 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 4703 ReuseShuffleIndicies); 4704 TE->setOperandsInOrder(); 4705 for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) { 4706 // For scalar operands there is no need to create an entry, since they do 4707 // not need to be vectorized. 4708 if (hasVectorInstrinsicScalarOpd(ID, i)) 4709 continue; 4710 ValueList Operands; 4711 // Prepare the operand vector. 4712 for (Value *V : VL) { 4713 auto *CI2 = cast<CallInst>(V); 4714 Operands.push_back(CI2->getArgOperand(i)); 4715 } 4716 buildTree_rec(Operands, Depth + 1, {TE, i}); 4717 } 4718 return; 4719 } 4720 case Instruction::ShuffleVector: { 4721 // If this is not an alternate sequence of opcodes like add-sub, 4722 // then do not vectorize this instruction.
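// Illustrative example (hypothetical IR): a bundle computing
//   { a0 + b0, a1 - b1, a2 + b2, a3 - b3 }
// is emitted as one vector add, one vector sub and a blending shufflevector;
// bundles mixing unrelated opcodes do not form such an alternate sequence and
// are gathered instead.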
4723 if (!S.isAltShuffle()) { 4724 BS.cancelScheduling(VL, VL0); 4725 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4726 ReuseShuffleIndicies); 4727 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); 4728 return; 4729 } 4730 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 4731 ReuseShuffleIndicies); 4732 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); 4733 4734 // Reorder operands if reordering would enable vectorization. 4735 auto *CI = dyn_cast<CmpInst>(VL0); 4736 if (isa<BinaryOperator>(VL0) || CI) { 4737 ValueList Left, Right; 4738 if (!CI || all_of(VL, [](Value *V) { 4739 return cast<CmpInst>(V)->isCommutative(); 4740 })) { 4741 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); 4742 } else { 4743 CmpInst::Predicate P0 = CI->getPredicate(); 4744 CmpInst::Predicate AltP0 = cast<CmpInst>(S.AltOp)->getPredicate(); 4745 assert(P0 != AltP0 && 4746 "Expected different main/alternate predicates."); 4747 CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0); 4748 Value *BaseOp0 = VL0->getOperand(0); 4749 Value *BaseOp1 = VL0->getOperand(1); 4750 // Collect operands - commute if it uses the swapped predicate or 4751 // alternate operation. 4752 for (Value *V : VL) { 4753 auto *Cmp = cast<CmpInst>(V); 4754 Value *LHS = Cmp->getOperand(0); 4755 Value *RHS = Cmp->getOperand(1); 4756 CmpInst::Predicate CurrentPred = Cmp->getPredicate(); 4757 if (P0 == AltP0Swapped) { 4758 if (CI != Cmp && S.AltOp != Cmp && 4759 ((P0 == CurrentPred && 4760 !areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) || 4761 (AltP0 == CurrentPred && 4762 areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)))) 4763 std::swap(LHS, RHS); 4764 } else if (P0 != CurrentPred && AltP0 != CurrentPred) { 4765 std::swap(LHS, RHS); 4766 } 4767 Left.push_back(LHS); 4768 Right.push_back(RHS); 4769 } 4770 } 4771 TE->setOperand(0, Left); 4772 TE->setOperand(1, Right); 4773 buildTree_rec(Left, Depth + 1, {TE, 0}); 4774 buildTree_rec(Right, Depth + 1, {TE, 1}); 4775 return; 4776 } 4777 4778 TE->setOperandsInOrder(); 4779 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 4780 ValueList Operands; 4781 // Prepare the operand vector. 4782 for (Value *V : VL) 4783 Operands.push_back(cast<Instruction>(V)->getOperand(i)); 4784 4785 buildTree_rec(Operands, Depth + 1, {TE, i}); 4786 } 4787 return; 4788 } 4789 default: 4790 BS.cancelScheduling(VL, VL0); 4791 newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, 4792 ReuseShuffleIndicies); 4793 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); 4794 return; 4795 } 4796 } 4797 4798 unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { 4799 unsigned N = 1; 4800 Type *EltTy = T; 4801 4802 while (isa<StructType>(EltTy) || isa<ArrayType>(EltTy) || 4803 isa<VectorType>(EltTy)) { 4804 if (auto *ST = dyn_cast<StructType>(EltTy)) { 4805 // Check that struct is homogeneous. 
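// e.g. (illustrative): { i32, i32, i32, i32 } maps to a 4-element vector of
// i32, while { i32, float } is rejected here because its element types differ.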
4806 for (const auto *Ty : ST->elements()) 4807 if (Ty != *ST->element_begin()) 4808 return 0; 4809 N *= ST->getNumElements(); 4810 EltTy = *ST->element_begin(); 4811 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) { 4812 N *= AT->getNumElements(); 4813 EltTy = AT->getElementType(); 4814 } else { 4815 auto *VT = cast<FixedVectorType>(EltTy); 4816 N *= VT->getNumElements(); 4817 EltTy = VT->getElementType(); 4818 } 4819 } 4820 4821 if (!isValidElementType(EltTy)) 4822 return 0; 4823 uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N)); 4824 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T)) 4825 return 0; 4826 return N; 4827 } 4828 4829 bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, 4830 SmallVectorImpl<unsigned> &CurrentOrder) const { 4831 const auto *It = find_if(VL, [](Value *V) { 4832 return isa<ExtractElementInst, ExtractValueInst>(V); 4833 }); 4834 assert(It != VL.end() && "Expected at least one extract instruction."); 4835 auto *E0 = cast<Instruction>(*It); 4836 assert(all_of(VL, 4837 [](Value *V) { 4838 return isa<UndefValue, ExtractElementInst, ExtractValueInst>( 4839 V); 4840 }) && 4841 "Invalid opcode"); 4842 // Check if all of the extracts come from the same vector and from the 4843 // correct offset. 4844 Value *Vec = E0->getOperand(0); 4845 4846 CurrentOrder.clear(); 4847 4848 // We have to extract from a vector/aggregate with the same number of elements. 4849 unsigned NElts; 4850 if (E0->getOpcode() == Instruction::ExtractValue) { 4851 const DataLayout &DL = E0->getModule()->getDataLayout(); 4852 NElts = canMapToVector(Vec->getType(), DL); 4853 if (!NElts) 4854 return false; 4855 // Check if the load can be rewritten as a load of a vector. 4856 LoadInst *LI = dyn_cast<LoadInst>(Vec); 4857 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size())) 4858 return false; 4859 } else { 4860 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements(); 4861 } 4862 4863 if (NElts != VL.size()) 4864 return false; 4865 4866 // Check that all of the indices extract from the correct offset. 4867 bool ShouldKeepOrder = true; 4868 unsigned E = VL.size(); 4869 // Assign to all items the initial value E so we can check if the extract 4870 // instruction index was used already. 4871 // Also, later we can check that all the indices are used and we have a 4872 // consecutive access in the extract instructions, by checking that no 4873 // element of CurrentOrder still has value E.
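// Illustrative example (hypothetical bundle): for extracts of lanes
// {1, 0, 3, 2} of the same source vector, the loop below produces
// CurrentOrder = {1, 0, 3, 2} and ShouldKeepOrder becomes false, i.e. the
// extracts are usable but need reordering.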
4874 CurrentOrder.assign(E, E); 4875 unsigned I = 0; 4876 for (; I < E; ++I) { 4877 auto *Inst = dyn_cast<Instruction>(VL[I]); 4878 if (!Inst) 4879 continue; 4880 if (Inst->getOperand(0) != Vec) 4881 break; 4882 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) 4883 if (isa<UndefValue>(EE->getIndexOperand())) 4884 continue; 4885 Optional<unsigned> Idx = getExtractIndex(Inst); 4886 if (!Idx) 4887 break; 4888 const unsigned ExtIdx = *Idx; 4889 if (ExtIdx != I) { 4890 if (ExtIdx >= E || CurrentOrder[ExtIdx] != E) 4891 break; 4892 ShouldKeepOrder = false; 4893 CurrentOrder[ExtIdx] = I; 4894 } else { 4895 if (CurrentOrder[I] != E) 4896 break; 4897 CurrentOrder[I] = I; 4898 } 4899 } 4900 if (I < E) { 4901 CurrentOrder.clear(); 4902 return false; 4903 } 4904 if (ShouldKeepOrder) 4905 CurrentOrder.clear(); 4906 4907 return ShouldKeepOrder; 4908 } 4909 4910 bool BoUpSLP::areAllUsersVectorized(Instruction *I, 4911 ArrayRef<Value *> VectorizedVals) const { 4912 return (I->hasOneUse() && is_contained(VectorizedVals, I)) || 4913 all_of(I->users(), [this](User *U) { 4914 return ScalarToTreeEntry.count(U) > 0 || 4915 isVectorLikeInstWithConstOps(U) || 4916 (isa<ExtractElementInst>(U) && MustGather.contains(U)); 4917 }); 4918 } 4919 4920 static std::pair<InstructionCost, InstructionCost> 4921 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, 4922 TargetTransformInfo *TTI, TargetLibraryInfo *TLI) { 4923 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4924 4925 // Calculate the cost of the scalar and vector calls. 4926 SmallVector<Type *, 4> VecTys; 4927 for (Use &Arg : CI->args()) 4928 VecTys.push_back( 4929 FixedVectorType::get(Arg->getType(), VecTy->getNumElements())); 4930 FastMathFlags FMF; 4931 if (auto *FPCI = dyn_cast<FPMathOperator>(CI)) 4932 FMF = FPCI->getFastMathFlags(); 4933 SmallVector<const Value *> Arguments(CI->args()); 4934 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys, FMF, 4935 dyn_cast<IntrinsicInst>(CI)); 4936 auto IntrinsicCost = 4937 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput); 4938 4939 auto Shape = VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>( 4940 VecTy->getNumElements())), 4941 false /*HasGlobalPred*/); 4942 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 4943 auto LibCost = IntrinsicCost; 4944 if (!CI->isNoBuiltin() && VecFunc) { 4945 // Calculate the cost of the vector library call. 4946 // If the corresponding vector call is cheaper, return its cost. 4947 LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys, 4948 TTI::TCK_RecipThroughput); 4949 } 4950 return {IntrinsicCost, LibCost}; 4951 } 4952 4953 /// Compute the cost of creating a vector of type \p VecTy containing the 4954 /// extracted values from \p VL. 4955 static InstructionCost 4956 computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, 4957 TargetTransformInfo::ShuffleKind ShuffleKind, 4958 ArrayRef<int> Mask, TargetTransformInfo &TTI) { 4959 unsigned NumOfParts = TTI.getNumberOfParts(VecTy); 4960 4961 if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc || !NumOfParts || 4962 VecTy->getNumElements() < NumOfParts) 4963 return TTI.getShuffleCost(ShuffleKind, VecTy, Mask); 4964 4965 bool AllConsecutive = true; 4966 unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts; 4967 unsigned Idx = -1; 4968 InstructionCost Cost = 0; 4969 4970 // Process extracts in blocks of EltsPerVector to check if the source vector 4971 // operand can be re-used directly. 
If not, add the cost of creating a shuffle 4972 // to extract the values into a vector register. 4973 for (auto *V : VL) { 4974 ++Idx; 4975 4976 // Need to exclude undefs from analysis. 4977 if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem) 4978 continue; 4979 4980 // Reached the start of a new vector register. 4981 if (Idx % EltsPerVector == 0) { 4982 AllConsecutive = true; 4983 continue; 4984 } 4985 4986 // Check that all extracts for a vector register on the target directly 4987 // extract values in order. 4988 unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V)); 4989 if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != UndefMaskElem) { 4990 unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1])); 4991 AllConsecutive &= PrevIdx + 1 == CurrentIdx && 4992 CurrentIdx % EltsPerVector == Idx % EltsPerVector; 4993 } 4994 4995 if (AllConsecutive) 4996 continue; 4997 4998 // Skip all indices, except for the last index per vector block. 4999 if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size()) 5000 continue; 5001 5002 // If we have a series of extracts which are not consecutive and hence 5003 // cannot re-use the source vector register directly, compute the shuffle 5004 // cost to extract a vector with EltsPerVector elements. 5005 Cost += TTI.getShuffleCost( 5006 TargetTransformInfo::SK_PermuteSingleSrc, 5007 FixedVectorType::get(VecTy->getElementType(), EltsPerVector)); 5008 } 5009 return Cost; 5010 } 5011 5012 /// Build the shuffle mask for shuffle graph entries and lists of main and 5013 /// alternate operation operands. 5014 static void 5015 buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices, 5016 ArrayRef<int> ReusesIndices, 5017 const function_ref<bool(Instruction *)> IsAltOp, 5018 SmallVectorImpl<int> &Mask, 5019 SmallVectorImpl<Value *> *OpScalars = nullptr, 5020 SmallVectorImpl<Value *> *AltScalars = nullptr) { 5021 unsigned Sz = VL.size(); 5022 Mask.assign(Sz, UndefMaskElem); 5023 SmallVector<int> OrderMask; 5024 if (!ReorderIndices.empty()) 5025 inversePermutation(ReorderIndices, OrderMask); 5026 for (unsigned I = 0; I < Sz; ++I) { 5027 unsigned Idx = I; 5028 if (!ReorderIndices.empty()) 5029 Idx = OrderMask[I]; 5030 auto *OpInst = cast<Instruction>(VL[Idx]); 5031 if (IsAltOp(OpInst)) { 5032 Mask[I] = Sz + Idx; 5033 if (AltScalars) 5034 AltScalars->push_back(OpInst); 5035 } else { 5036 Mask[I] = Idx; 5037 if (OpScalars) 5038 OpScalars->push_back(OpInst); 5039 } 5040 } 5041 if (!ReusesIndices.empty()) { 5042 SmallVector<int> NewMask(ReusesIndices.size(), UndefMaskElem); 5043 transform(ReusesIndices, NewMask.begin(), [&Mask](int Idx) { 5044 return Idx != UndefMaskElem ? Mask[Idx] : UndefMaskElem; 5045 }); 5046 Mask.swap(NewMask); 5047 } 5048 } 5049 5050 /// Checks if the specified instruction \p I is an alternate operation for the 5051 /// given \p MainOp and \p AltOp instructions.
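/// For example (illustrative): with \p MainOp an icmp eq and \p AltOp an
/// icmp sgt, an icmp sgt (or its swapped form icmp slt) in the bundle is
/// classified as the alternate operation, while an icmp eq is treated as the
/// main one.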
5052 static bool isAlternateInstruction(const Instruction *I, 5053 const Instruction *MainOp, 5054 const Instruction *AltOp) { 5055 if (auto *CI0 = dyn_cast<CmpInst>(MainOp)) { 5056 auto *AltCI0 = cast<CmpInst>(AltOp); 5057 auto *CI = cast<CmpInst>(I); 5058 CmpInst::Predicate P0 = CI0->getPredicate(); 5059 CmpInst::Predicate AltP0 = AltCI0->getPredicate(); 5060 assert(P0 != AltP0 && "Expected different main/alternate predicates."); 5061 CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0); 5062 CmpInst::Predicate CurrentPred = CI->getPredicate(); 5063 if (P0 == AltP0Swapped) 5064 return I == AltCI0 || 5065 (I != MainOp && 5066 !areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1), 5067 CI->getOperand(0), CI->getOperand(1))); 5068 return AltP0 == CurrentPred || AltP0Swapped == CurrentPred; 5069 } 5070 return I->getOpcode() == AltOp->getOpcode(); 5071 } 5072 5073 InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, 5074 ArrayRef<Value *> VectorizedVals) { 5075 ArrayRef<Value*> VL = E->Scalars; 5076 5077 Type *ScalarTy = VL[0]->getType(); 5078 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 5079 ScalarTy = SI->getValueOperand()->getType(); 5080 else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0])) 5081 ScalarTy = CI->getOperand(0)->getType(); 5082 else if (auto *IE = dyn_cast<InsertElementInst>(VL[0])) 5083 ScalarTy = IE->getOperand(1)->getType(); 5084 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); 5085 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5086 5087 // If we have computed a smaller type for the expression, update VecTy so 5088 // that the costs will be accurate. 5089 if (MinBWs.count(VL[0])) 5090 VecTy = FixedVectorType::get( 5091 IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); 5092 unsigned EntryVF = E->getVectorFactor(); 5093 auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF); 5094 5095 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); 5096 // FIXME: it tries to fix a problem with MSVC buildbots. 5097 TargetTransformInfo &TTIRef = *TTI; 5098 auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy, 5099 VectorizedVals, E](InstructionCost &Cost) { 5100 DenseMap<Value *, int> ExtractVectorsTys; 5101 SmallPtrSet<Value *, 4> CheckedExtracts; 5102 for (auto *V : VL) { 5103 if (isa<UndefValue>(V)) 5104 continue; 5105 // If all users of instruction are going to be vectorized and this 5106 // instruction itself is not going to be vectorized, consider this 5107 // instruction as dead and remove its cost from the final cost of the 5108 // vectorized tree. 5109 // Also, avoid adjusting the cost for extractelements with multiple uses 5110 // in different graph entries. 5111 const TreeEntry *VE = getTreeEntry(V); 5112 if (!CheckedExtracts.insert(V).second || 5113 !areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) || 5114 (VE && VE != E)) 5115 continue; 5116 auto *EE = cast<ExtractElementInst>(V); 5117 Optional<unsigned> EEIdx = getExtractIndex(EE); 5118 if (!EEIdx) 5119 continue; 5120 unsigned Idx = *EEIdx; 5121 if (TTIRef.getNumberOfParts(VecTy) != 5122 TTIRef.getNumberOfParts(EE->getVectorOperandType())) { 5123 auto It = 5124 ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first; 5125 It->getSecond() = std::min<int>(It->second, Idx); 5126 } 5127 // Take credit for instruction that will become dead. 
5128 if (EE->hasOneUse()) { 5129 Instruction *Ext = EE->user_back(); 5130 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) && 5131 all_of(Ext->users(), 5132 [](User *U) { return isa<GetElementPtrInst>(U); })) { 5133 // Use getExtractWithExtendCost() to calculate the cost of 5134 // extractelement/ext pair. 5135 Cost -= 5136 TTIRef.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), 5137 EE->getVectorOperandType(), Idx); 5138 // Add back the cost of s|zext which is subtracted separately. 5139 Cost += TTIRef.getCastInstrCost( 5140 Ext->getOpcode(), Ext->getType(), EE->getType(), 5141 TTI::getCastContextHint(Ext), CostKind, Ext); 5142 continue; 5143 } 5144 } 5145 Cost -= TTIRef.getVectorInstrCost(Instruction::ExtractElement, 5146 EE->getVectorOperandType(), Idx); 5147 } 5148 // Add a cost for subvector extracts/inserts if required. 5149 for (const auto &Data : ExtractVectorsTys) { 5150 auto *EEVTy = cast<FixedVectorType>(Data.first->getType()); 5151 unsigned NumElts = VecTy->getNumElements(); 5152 if (Data.second % NumElts == 0) 5153 continue; 5154 if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) { 5155 unsigned Idx = (Data.second / NumElts) * NumElts; 5156 unsigned EENumElts = EEVTy->getNumElements(); 5157 if (Idx + NumElts <= EENumElts) { 5158 Cost += 5159 TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 5160 EEVTy, None, Idx, VecTy); 5161 } else { 5162 // Need to round up the subvector type vectorization factor to avoid a 5163 // crash in cost model functions. Make SubVT so that Idx + VF of SubVT 5164 // <= EENumElts. 5165 auto *SubVT = 5166 FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx); 5167 Cost += 5168 TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 5169 EEVTy, None, Idx, SubVT); 5170 } 5171 } else { 5172 Cost += TTIRef.getShuffleCost(TargetTransformInfo::SK_InsertSubvector, 5173 VecTy, None, 0, EEVTy); 5174 } 5175 } 5176 }; 5177 if (E->State == TreeEntry::NeedToGather) { 5178 if (allConstant(VL)) 5179 return 0; 5180 if (isa<InsertElementInst>(VL[0])) 5181 return InstructionCost::getInvalid(); 5182 SmallVector<int> Mask; 5183 SmallVector<const TreeEntry *> Entries; 5184 Optional<TargetTransformInfo::ShuffleKind> Shuffle = 5185 isGatherShuffledEntry(E, Mask, Entries); 5186 if (Shuffle.hasValue()) { 5187 InstructionCost GatherCost = 0; 5188 if (ShuffleVectorInst::isIdentityMask(Mask)) { 5189 // Perfect match in the graph, will reuse the previously vectorized 5190 // node. Cost is 0. 5191 LLVM_DEBUG( 5192 dbgs() 5193 << "SLP: perfect diamond match for gather bundle that starts with " 5194 << *VL.front() << ".\n"); 5195 if (NeedToShuffleReuses) 5196 GatherCost = 5197 TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, 5198 FinalVecTy, E->ReuseShuffleIndices); 5199 } else { 5200 LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() 5201 << " entries for bundle that starts with " 5202 << *VL.front() << ".\n"); 5203 // Detected that instead of gather we can emit a shuffle of single/two 5204 // previously vectorized nodes. Add the cost of the permutation rather 5205 // than gather. 
5206 ::addMask(Mask, E->ReuseShuffleIndices); 5207 GatherCost = TTI->getShuffleCost(*Shuffle, FinalVecTy, Mask); 5208 } 5209 return GatherCost; 5210 } 5211 if ((E->getOpcode() == Instruction::ExtractElement || 5212 all_of(E->Scalars, 5213 [](Value *V) { 5214 return isa<ExtractElementInst, UndefValue>(V); 5215 })) && 5216 allSameType(VL)) { 5217 // Check that gather of extractelements can be represented as just a 5218 // shuffle of a single/two vectors the scalars are extracted from. 5219 SmallVector<int> Mask; 5220 Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = 5221 isFixedVectorShuffle(VL, Mask); 5222 if (ShuffleKind.hasValue()) { 5223 // Found the bunch of extractelement instructions that must be gathered 5224 // into a vector and can be represented as a permutation elements in a 5225 // single input vector or of 2 input vectors. 5226 InstructionCost Cost = 5227 computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI); 5228 AdjustExtractsCost(Cost); 5229 if (NeedToShuffleReuses) 5230 Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, 5231 FinalVecTy, E->ReuseShuffleIndices); 5232 return Cost; 5233 } 5234 } 5235 if (isSplat(VL)) { 5236 // Found the broadcasting of the single scalar, calculate the cost as the 5237 // broadcast. 5238 assert(VecTy == FinalVecTy && 5239 "No reused scalars expected for broadcast."); 5240 return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); 5241 } 5242 InstructionCost ReuseShuffleCost = 0; 5243 if (NeedToShuffleReuses) 5244 ReuseShuffleCost = TTI->getShuffleCost( 5245 TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); 5246 // Improve gather cost for gather of loads, if we can group some of the 5247 // loads into vector loads. 5248 if (VL.size() > 2 && E->getOpcode() == Instruction::Load && 5249 !E->isAltShuffle()) { 5250 BoUpSLP::ValueSet VectorizedLoads; 5251 unsigned StartIdx = 0; 5252 unsigned VF = VL.size() / 2; 5253 unsigned VectorizedCnt = 0; 5254 unsigned ScatterVectorizeCnt = 0; 5255 const unsigned Sz = DL->getTypeSizeInBits(E->getMainOp()->getType()); 5256 for (unsigned MinVF = getMinVF(2 * Sz); VF >= MinVF; VF /= 2) { 5257 for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End; 5258 Cnt += VF) { 5259 ArrayRef<Value *> Slice = VL.slice(Cnt, VF); 5260 if (!VectorizedLoads.count(Slice.front()) && 5261 !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { 5262 SmallVector<Value *> PointerOps; 5263 OrdersType CurrentOrder; 5264 LoadsState LS = canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, 5265 *SE, CurrentOrder, PointerOps); 5266 switch (LS) { 5267 case LoadsState::Vectorize: 5268 case LoadsState::ScatterVectorize: 5269 // Mark the vectorized loads so that we don't vectorize them 5270 // again. 5271 if (LS == LoadsState::Vectorize) 5272 ++VectorizedCnt; 5273 else 5274 ++ScatterVectorizeCnt; 5275 VectorizedLoads.insert(Slice.begin(), Slice.end()); 5276 // If we vectorized initial block, no need to try to vectorize it 5277 // again. 5278 if (Cnt == StartIdx) 5279 StartIdx += VF; 5280 break; 5281 case LoadsState::Gather: 5282 break; 5283 } 5284 } 5285 } 5286 // Check if the whole array was vectorized already - exit. 5287 if (StartIdx >= VL.size()) 5288 break; 5289 // Found vectorizable parts - exit. 
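// Illustrative walk-through (hypothetical bundle): for 8 gathered loads the
// loop above tries VF = 4 first and, if nothing vectorizes at that width,
// halves VF down to the minimum factor; if e.g. the first four loads are
// consecutive they are costed below as one vector load and only the remaining
// scalars are costed as a gather.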
5290 if (!VectorizedLoads.empty()) 5291 break; 5292 } 5293 if (!VectorizedLoads.empty()) { 5294 InstructionCost GatherCost = 0; 5295 unsigned NumParts = TTI->getNumberOfParts(VecTy); 5296 bool NeedInsertSubvectorAnalysis = 5297 !NumParts || (VL.size() / VF) > NumParts; 5298 // Get the cost for gathered loads. 5299 for (unsigned I = 0, End = VL.size(); I < End; I += VF) { 5300 if (VectorizedLoads.contains(VL[I])) 5301 continue; 5302 GatherCost += getGatherCost(VL.slice(I, VF)); 5303 } 5304 // The cost for vectorized loads. 5305 InstructionCost ScalarsCost = 0; 5306 for (Value *V : VectorizedLoads) { 5307 auto *LI = cast<LoadInst>(V); 5308 ScalarsCost += TTI->getMemoryOpCost( 5309 Instruction::Load, LI->getType(), LI->getAlign(), 5310 LI->getPointerAddressSpace(), CostKind, LI); 5311 } 5312 auto *LI = cast<LoadInst>(E->getMainOp()); 5313 auto *LoadTy = FixedVectorType::get(LI->getType(), VF); 5314 Align Alignment = LI->getAlign(); 5315 GatherCost += 5316 VectorizedCnt * 5317 TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment, 5318 LI->getPointerAddressSpace(), CostKind, LI); 5319 GatherCost += ScatterVectorizeCnt * 5320 TTI->getGatherScatterOpCost( 5321 Instruction::Load, LoadTy, LI->getPointerOperand(), 5322 /*VariableMask=*/false, Alignment, CostKind, LI); 5323 if (NeedInsertSubvectorAnalysis) { 5324 // Add the cost for the subvector inserts. 5325 for (int I = VF, E = VL.size(); I < E; I += VF) 5326 GatherCost += TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, 5327 None, I, LoadTy); 5328 } 5329 return ReuseShuffleCost + GatherCost - ScalarsCost; 5330 } 5331 } 5332 return ReuseShuffleCost + getGatherCost(VL); 5333 } 5334 InstructionCost CommonCost = 0; 5335 SmallVector<int> Mask; 5336 if (!E->ReorderIndices.empty()) { 5337 SmallVector<int> NewMask; 5338 if (E->getOpcode() == Instruction::Store) { 5339 // For stores the order is actually a mask. 5340 NewMask.resize(E->ReorderIndices.size()); 5341 copy(E->ReorderIndices, NewMask.begin()); 5342 } else { 5343 inversePermutation(E->ReorderIndices, NewMask); 5344 } 5345 ::addMask(Mask, NewMask); 5346 } 5347 if (NeedToShuffleReuses) 5348 ::addMask(Mask, E->ReuseShuffleIndices); 5349 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask)) 5350 CommonCost = 5351 TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask); 5352 assert((E->State == TreeEntry::Vectorize || 5353 E->State == TreeEntry::ScatterVectorize) && 5354 "Unhandled state"); 5355 assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); 5356 Instruction *VL0 = E->getMainOp(); 5357 unsigned ShuffleOrOp = 5358 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); 5359 switch (ShuffleOrOp) { 5360 case Instruction::PHI: 5361 return 0; 5362 5363 case Instruction::ExtractValue: 5364 case Instruction::ExtractElement: { 5365 // The common cost of removing the ExtractElement/ExtractValue instructions + 5366 // the cost of shuffles, if required to reshuffle the original vector.
5367 if (NeedToShuffleReuses) { 5368 unsigned Idx = 0; 5369 for (unsigned I : E->ReuseShuffleIndices) { 5370 if (ShuffleOrOp == Instruction::ExtractElement) { 5371 auto *EE = cast<ExtractElementInst>(VL[I]); 5372 CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, 5373 EE->getVectorOperandType(), 5374 *getExtractIndex(EE)); 5375 } else { 5376 CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, 5377 VecTy, Idx); 5378 ++Idx; 5379 } 5380 } 5381 Idx = EntryVF; 5382 for (Value *V : VL) { 5383 if (ShuffleOrOp == Instruction::ExtractElement) { 5384 auto *EE = cast<ExtractElementInst>(V); 5385 CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement, 5386 EE->getVectorOperandType(), 5387 *getExtractIndex(EE)); 5388 } else { 5389 --Idx; 5390 CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement, 5391 VecTy, Idx); 5392 } 5393 } 5394 } 5395 if (ShuffleOrOp == Instruction::ExtractValue) { 5396 for (unsigned I = 0, E = VL.size(); I < E; ++I) { 5397 auto *EI = cast<Instruction>(VL[I]); 5398 // Take credit for instruction that will become dead. 5399 if (EI->hasOneUse()) { 5400 Instruction *Ext = EI->user_back(); 5401 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) && 5402 all_of(Ext->users(), 5403 [](User *U) { return isa<GetElementPtrInst>(U); })) { 5404 // Use getExtractWithExtendCost() to calculate the cost of 5405 // extractelement/ext pair. 5406 CommonCost -= TTI->getExtractWithExtendCost( 5407 Ext->getOpcode(), Ext->getType(), VecTy, I); 5408 // Add back the cost of s|zext which is subtracted separately. 5409 CommonCost += TTI->getCastInstrCost( 5410 Ext->getOpcode(), Ext->getType(), EI->getType(), 5411 TTI::getCastContextHint(Ext), CostKind, Ext); 5412 continue; 5413 } 5414 } 5415 CommonCost -= 5416 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); 5417 } 5418 } else { 5419 AdjustExtractsCost(CommonCost); 5420 } 5421 return CommonCost; 5422 } 5423 case Instruction::InsertElement: { 5424 assert(E->ReuseShuffleIndices.empty() && 5425 "Unique insertelements only are expected."); 5426 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType()); 5427 5428 unsigned const NumElts = SrcVecTy->getNumElements(); 5429 unsigned const NumScalars = VL.size(); 5430 APInt DemandedElts = APInt::getZero(NumElts); 5431 // TODO: Add support for Instruction::InsertValue. 5432 SmallVector<int> Mask; 5433 if (!E->ReorderIndices.empty()) { 5434 inversePermutation(E->ReorderIndices, Mask); 5435 Mask.append(NumElts - NumScalars, UndefMaskElem); 5436 } else { 5437 Mask.assign(NumElts, UndefMaskElem); 5438 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); 5439 } 5440 unsigned Offset = *getInsertIndex(VL0); 5441 bool IsIdentity = true; 5442 SmallVector<int> PrevMask(NumElts, UndefMaskElem); 5443 Mask.swap(PrevMask); 5444 for (unsigned I = 0; I < NumScalars; ++I) { 5445 unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]); 5446 DemandedElts.setBit(InsertIdx); 5447 IsIdentity &= InsertIdx - Offset == I; 5448 Mask[InsertIdx - Offset] = I; 5449 } 5450 assert(Offset < NumElts && "Failed to find vector index offset"); 5451 5452 InstructionCost Cost = 0; 5453 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts, 5454 /*Insert*/ true, /*Extract*/ false); 5455 5456 if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0) { 5457 // FIXME: Replace with SK_InsertSubvector once it is properly supported. 
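// Illustrative example (hypothetical): inserting 4 scalars starting at
// offset 2 of a <16 x float> build vector is modelled here as a
// SK_PermuteSingleSrc shuffle of an <8 x float> subvector, since
// PowerOf2Ceil(2 + 4) == 8.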
5458 unsigned Sz = PowerOf2Ceil(Offset + NumScalars); 5459 Cost += TTI->getShuffleCost( 5460 TargetTransformInfo::SK_PermuteSingleSrc, 5461 FixedVectorType::get(SrcVecTy->getElementType(), Sz)); 5462 } else if (!IsIdentity) { 5463 auto *FirstInsert = 5464 cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { 5465 return !is_contained(E->Scalars, 5466 cast<Instruction>(V)->getOperand(0)); 5467 })); 5468 if (isUndefVector(FirstInsert->getOperand(0))) { 5469 Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask); 5470 } else { 5471 SmallVector<int> InsertMask(NumElts); 5472 std::iota(InsertMask.begin(), InsertMask.end(), 0); 5473 for (unsigned I = 0; I < NumElts; I++) { 5474 if (Mask[I] != UndefMaskElem) 5475 InsertMask[Offset + I] = NumElts + I; 5476 } 5477 Cost += 5478 TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVecTy, InsertMask); 5479 } 5480 } 5481 5482 return Cost; 5483 } 5484 case Instruction::ZExt: 5485 case Instruction::SExt: 5486 case Instruction::FPToUI: 5487 case Instruction::FPToSI: 5488 case Instruction::FPExt: 5489 case Instruction::PtrToInt: 5490 case Instruction::IntToPtr: 5491 case Instruction::SIToFP: 5492 case Instruction::UIToFP: 5493 case Instruction::Trunc: 5494 case Instruction::FPTrunc: 5495 case Instruction::BitCast: { 5496 Type *SrcTy = VL0->getOperand(0)->getType(); 5497 InstructionCost ScalarEltCost = 5498 TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, 5499 TTI::getCastContextHint(VL0), CostKind, VL0); 5500 if (NeedToShuffleReuses) { 5501 CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; 5502 } 5503 5504 // Calculate the cost of this instruction. 5505 InstructionCost ScalarCost = VL.size() * ScalarEltCost; 5506 5507 auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size()); 5508 InstructionCost VecCost = 0; 5509 // Check if the values are candidates to demote. 5510 if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { 5511 VecCost = CommonCost + TTI->getCastInstrCost( 5512 E->getOpcode(), VecTy, SrcVecTy, 5513 TTI::getCastContextHint(VL0), CostKind, VL0); 5514 } 5515 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); 5516 return VecCost - ScalarCost; 5517 } 5518 case Instruction::FCmp: 5519 case Instruction::ICmp: 5520 case Instruction::Select: { 5521 // Calculate the cost of this instruction. 5522 InstructionCost ScalarEltCost = 5523 TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), 5524 CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0); 5525 if (NeedToShuffleReuses) { 5526 CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; 5527 } 5528 auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size()); 5529 InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; 5530 5531 // Check if all entries in VL are either compares or selects with compares 5532 // as condition that have the same predicates. 5533 CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE; 5534 bool First = true; 5535 for (auto *V : VL) { 5536 CmpInst::Predicate CurrentPred; 5537 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value()); 5538 if ((!match(V, m_Select(MatchCmp, m_Value(), m_Value())) && 5539 !match(V, MatchCmp)) || 5540 (!First && VecPred != CurrentPred)) { 5541 VecPred = CmpInst::BAD_ICMP_PREDICATE; 5542 break; 5543 } 5544 First = false; 5545 VecPred = CurrentPred; 5546 } 5547 5548 InstructionCost VecCost = TTI->getCmpSelInstrCost( 5549 E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0); 5550 // Check if it is possible and profitable to use min/max for selects in 5551 // VL. 
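// e.g. (illustrative): a bundle of 'select (icmp slt a, b), a, b' patterns
// may be costed as a single @llvm.smin vector intrinsic instead of a vector
// icmp plus select, when the target reports the intrinsic as cheaper.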
5552 // 5553 auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL); 5554 if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) { 5555 IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy, 5556 {VecTy, VecTy}); 5557 InstructionCost IntrinsicCost = 5558 TTI->getIntrinsicInstrCost(CostAttrs, CostKind); 5559 // If the selects are the only uses of the compares, they will be dead 5560 // and we can adjust the cost by removing their cost. 5561 if (IntrinsicAndUse.second) 5562 IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, 5563 MaskTy, VecPred, CostKind); 5564 VecCost = std::min(VecCost, IntrinsicCost); 5565 } 5566 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); 5567 return CommonCost + VecCost - ScalarCost; 5568 } 5569 case Instruction::FNeg: 5570 case Instruction::Add: 5571 case Instruction::FAdd: 5572 case Instruction::Sub: 5573 case Instruction::FSub: 5574 case Instruction::Mul: 5575 case Instruction::FMul: 5576 case Instruction::UDiv: 5577 case Instruction::SDiv: 5578 case Instruction::FDiv: 5579 case Instruction::URem: 5580 case Instruction::SRem: 5581 case Instruction::FRem: 5582 case Instruction::Shl: 5583 case Instruction::LShr: 5584 case Instruction::AShr: 5585 case Instruction::And: 5586 case Instruction::Or: 5587 case Instruction::Xor: { 5588 // Certain instructions can be cheaper to vectorize if they have a 5589 // constant second vector operand. 5590 TargetTransformInfo::OperandValueKind Op1VK = 5591 TargetTransformInfo::OK_AnyValue; 5592 TargetTransformInfo::OperandValueKind Op2VK = 5593 TargetTransformInfo::OK_UniformConstantValue; 5594 TargetTransformInfo::OperandValueProperties Op1VP = 5595 TargetTransformInfo::OP_None; 5596 TargetTransformInfo::OperandValueProperties Op2VP = 5597 TargetTransformInfo::OP_PowerOf2; 5598 5599 // If all operands are exactly the same ConstantInt then set the 5600 // operand kind to OK_UniformConstantValue. 5601 // If instead not all operands are constants, then set the operand kind 5602 // to OK_AnyValue. If all operands are constants but not the same, 5603 // then set the operand kind to OK_NonUniformConstantValue. 5604 ConstantInt *CInt0 = nullptr; 5605 for (unsigned i = 0, e = VL.size(); i < e; ++i) { 5606 const Instruction *I = cast<Instruction>(VL[i]); 5607 unsigned OpIdx = isa<BinaryOperator>(I) ? 
1 : 0; 5608 ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(OpIdx)); 5609 if (!CInt) { 5610 Op2VK = TargetTransformInfo::OK_AnyValue; 5611 Op2VP = TargetTransformInfo::OP_None; 5612 break; 5613 } 5614 if (Op2VP == TargetTransformInfo::OP_PowerOf2 && 5615 !CInt->getValue().isPowerOf2()) 5616 Op2VP = TargetTransformInfo::OP_None; 5617 if (i == 0) { 5618 CInt0 = CInt; 5619 continue; 5620 } 5621 if (CInt0 != CInt) 5622 Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; 5623 } 5624 5625 SmallVector<const Value *, 4> Operands(VL0->operand_values()); 5626 InstructionCost ScalarEltCost = 5627 TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK, 5628 Op2VK, Op1VP, Op2VP, Operands, VL0); 5629 if (NeedToShuffleReuses) { 5630 CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; 5631 } 5632 InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; 5633 InstructionCost VecCost = 5634 TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK, 5635 Op2VK, Op1VP, Op2VP, Operands, VL0); 5636 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); 5637 return CommonCost + VecCost - ScalarCost; 5638 } 5639 case Instruction::GetElementPtr: { 5640 TargetTransformInfo::OperandValueKind Op1VK = 5641 TargetTransformInfo::OK_AnyValue; 5642 TargetTransformInfo::OperandValueKind Op2VK = 5643 TargetTransformInfo::OK_UniformConstantValue; 5644 5645 InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost( 5646 Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK); 5647 if (NeedToShuffleReuses) { 5648 CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; 5649 } 5650 InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; 5651 InstructionCost VecCost = TTI->getArithmeticInstrCost( 5652 Instruction::Add, VecTy, CostKind, Op1VK, Op2VK); 5653 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); 5654 return CommonCost + VecCost - ScalarCost; 5655 } 5656 case Instruction::Load: { 5657 // Cost of wide load - cost of scalar loads. 5658 Align Alignment = cast<LoadInst>(VL0)->getAlign(); 5659 InstructionCost ScalarEltCost = TTI->getMemoryOpCost( 5660 Instruction::Load, ScalarTy, Alignment, 0, CostKind, VL0); 5661 if (NeedToShuffleReuses) { 5662 CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; 5663 } 5664 InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; 5665 InstructionCost VecLdCost; 5666 if (E->State == TreeEntry::Vectorize) { 5667 VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment, 0, 5668 CostKind, VL0); 5669 } else { 5670 assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState"); 5671 Align CommonAlignment = Alignment; 5672 for (Value *V : VL) 5673 CommonAlignment = 5674 commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); 5675 VecLdCost = TTI->getGatherScatterOpCost( 5676 Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(), 5677 /*VariableMask=*/false, CommonAlignment, CostKind, VL0); 5678 } 5679 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost)); 5680 return CommonCost + VecLdCost - ScalarLdCost; 5681 } 5682 case Instruction::Store: { 5683 // We know that we can merge the stores. Calculate the cost. 5684 bool IsReorder = !E->ReorderIndices.empty(); 5685 auto *SI = 5686 cast<StoreInst>(IsReorder ? 
VL[E->ReorderIndices.front()] : VL0); 5687 Align Alignment = SI->getAlign(); 5688 InstructionCost ScalarEltCost = TTI->getMemoryOpCost( 5689 Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0); 5690 InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost; 5691 InstructionCost VecStCost = TTI->getMemoryOpCost( 5692 Instruction::Store, VecTy, Alignment, 0, CostKind, VL0); 5693 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost)); 5694 return CommonCost + VecStCost - ScalarStCost; 5695 } 5696 case Instruction::Call: { 5697 CallInst *CI = cast<CallInst>(VL0); 5698 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 5699 5700 // Calculate the cost of the scalar and vector calls. 5701 IntrinsicCostAttributes CostAttrs(ID, *CI, 1); 5702 InstructionCost ScalarEltCost = 5703 TTI->getIntrinsicInstrCost(CostAttrs, CostKind); 5704 if (NeedToShuffleReuses) { 5705 CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; 5706 } 5707 InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; 5708 5709 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI); 5710 InstructionCost VecCallCost = 5711 std::min(VecCallCosts.first, VecCallCosts.second); 5712 5713 LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost 5714 << " (" << VecCallCost << "-" << ScalarCallCost << ")" 5715 << " for " << *CI << "\n"); 5716 5717 return CommonCost + VecCallCost - ScalarCallCost; 5718 } 5719 case Instruction::ShuffleVector: { 5720 assert(E->isAltShuffle() && 5721 ((Instruction::isBinaryOp(E->getOpcode()) && 5722 Instruction::isBinaryOp(E->getAltOpcode())) || 5723 (Instruction::isCast(E->getOpcode()) && 5724 Instruction::isCast(E->getAltOpcode())) || 5725 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && 5726 "Invalid Shuffle Vector Operand"); 5727 InstructionCost ScalarCost = 0; 5728 if (NeedToShuffleReuses) { 5729 for (unsigned Idx : E->ReuseShuffleIndices) { 5730 Instruction *I = cast<Instruction>(VL[Idx]); 5731 CommonCost -= TTI->getInstructionCost(I, CostKind); 5732 } 5733 for (Value *V : VL) { 5734 Instruction *I = cast<Instruction>(V); 5735 CommonCost += TTI->getInstructionCost(I, CostKind); 5736 } 5737 } 5738 for (Value *V : VL) { 5739 Instruction *I = cast<Instruction>(V); 5740 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); 5741 ScalarCost += TTI->getInstructionCost(I, CostKind); 5742 } 5743 // VecCost is equal to sum of the cost of creating 2 vectors 5744 // and the cost of creating shuffle. 5745 InstructionCost VecCost = 0; 5746 // Try to find the previous shuffle node with the same operands and same 5747 // main/alternate ops. 5748 auto &&TryFindNodeWithEqualOperands = [this, E]() { 5749 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { 5750 if (TE.get() == E) 5751 break; 5752 if (TE->isAltShuffle() && 5753 ((TE->getOpcode() == E->getOpcode() && 5754 TE->getAltOpcode() == E->getAltOpcode()) || 5755 (TE->getOpcode() == E->getAltOpcode() && 5756 TE->getAltOpcode() == E->getOpcode())) && 5757 TE->hasEqualOperands(*E)) 5758 return true; 5759 } 5760 return false; 5761 }; 5762 if (TryFindNodeWithEqualOperands()) { 5763 LLVM_DEBUG({ 5764 dbgs() << "SLP: diamond match for alternate node found.\n"; 5765 E->dump(); 5766 }); 5767 // No need to add new vector costs here since we're going to reuse 5768 // same main/alternate vector ops, just do different shuffling. 
5769 } else if (Instruction::isBinaryOp(E->getOpcode())) { 5770 VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); 5771 VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, 5772 CostKind); 5773 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) { 5774 VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, 5775 Builder.getInt1Ty(), 5776 CI0->getPredicate(), CostKind, VL0); 5777 VecCost += TTI->getCmpSelInstrCost( 5778 E->getOpcode(), ScalarTy, Builder.getInt1Ty(), 5779 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind, 5780 E->getAltOp()); 5781 } else { 5782 Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType(); 5783 Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType(); 5784 auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size()); 5785 auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size()); 5786 VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty, 5787 TTI::CastContextHint::None, CostKind); 5788 VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty, 5789 TTI::CastContextHint::None, CostKind); 5790 } 5791 5792 SmallVector<int> Mask; 5793 buildShuffleEntryMask( 5794 E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, 5795 [E](Instruction *I) { 5796 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); 5797 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp()); 5798 }, 5799 Mask); 5800 CommonCost = 5801 TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy, Mask); 5802 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); 5803 return CommonCost + VecCost - ScalarCost; 5804 } 5805 default: 5806 llvm_unreachable("Unknown instruction"); 5807 } 5808 } 5809 5810 bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { 5811 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height " 5812 << VectorizableTree.size() << " is fully vectorizable .\n"); 5813 5814 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) { 5815 SmallVector<int> Mask; 5816 return TE->State == TreeEntry::NeedToGather && 5817 !any_of(TE->Scalars, 5818 [this](Value *V) { return EphValues.contains(V); }) && 5819 (allConstant(TE->Scalars) || isSplat(TE->Scalars) || 5820 TE->Scalars.size() < Limit || 5821 ((TE->getOpcode() == Instruction::ExtractElement || 5822 all_of(TE->Scalars, 5823 [](Value *V) { 5824 return isa<ExtractElementInst, UndefValue>(V); 5825 })) && 5826 isFixedVectorShuffle(TE->Scalars, Mask)) || 5827 (TE->State == TreeEntry::NeedToGather && 5828 TE->getOpcode() == Instruction::Load && !TE->isAltShuffle())); 5829 }; 5830 5831 // We only handle trees of heights 1 and 2. 5832 if (VectorizableTree.size() == 1 && 5833 (VectorizableTree[0]->State == TreeEntry::Vectorize || 5834 (ForReduction && 5835 AreVectorizableGathers(VectorizableTree[0].get(), 5836 VectorizableTree[0]->Scalars.size()) && 5837 VectorizableTree[0]->getVectorFactor() > 2))) 5838 return true; 5839 5840 if (VectorizableTree.size() != 2) 5841 return false; 5842 5843 // Handle splat and all-constants stores. Also try to vectorize tiny trees 5844 // with the second gather nodes if they have less scalar operands rather than 5845 // the initial tree element (may be profitable to shuffle the second gather) 5846 // or they are extractelements, which form shuffle. 
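// Illustrative example (hypothetical): a two-node tree whose first node is a
// vectorizable store bundle and whose second node gathers a splat is accepted
// below, while a tree whose root node itself must be gathered is rejected as
// too costly.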
5847 SmallVector<int> Mask; 5848 if (VectorizableTree[0]->State == TreeEntry::Vectorize && 5849 AreVectorizableGathers(VectorizableTree[1].get(), 5850 VectorizableTree[0]->Scalars.size())) 5851 return true; 5852 5853 // Gathering cost would be too much for tiny trees. 5854 if (VectorizableTree[0]->State == TreeEntry::NeedToGather || 5855 (VectorizableTree[1]->State == TreeEntry::NeedToGather && 5856 VectorizableTree[0]->State != TreeEntry::ScatterVectorize)) 5857 return false; 5858 5859 return true; 5860 } 5861 5862 static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, 5863 TargetTransformInfo *TTI, 5864 bool MustMatchOrInst) { 5865 // Look past the root to find a source value. Arbitrarily follow the 5866 // path through operand 0 of any 'or'. Also, peek through optional 5867 // shift-left-by-multiple-of-8-bits. 5868 Value *ZextLoad = Root; 5869 const APInt *ShAmtC; 5870 bool FoundOr = false; 5871 while (!isa<ConstantExpr>(ZextLoad) && 5872 (match(ZextLoad, m_Or(m_Value(), m_Value())) || 5873 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) && 5874 ShAmtC->urem(8) == 0))) { 5875 auto *BinOp = cast<BinaryOperator>(ZextLoad); 5876 ZextLoad = BinOp->getOperand(0); 5877 if (BinOp->getOpcode() == Instruction::Or) 5878 FoundOr = true; 5879 } 5880 // Check if the input is an extended load of the required or/shift expression. 5881 Value *Load; 5882 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root || 5883 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load)) 5884 return false; 5885 5886 // Require that the total load bit width is a legal integer type. 5887 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target. 5888 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it. 5889 Type *SrcTy = Load->getType(); 5890 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts; 5891 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth))) 5892 return false; 5893 5894 // Everything matched - assume that we can fold the whole sequence using 5895 // load combining. 5896 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at " 5897 << *(cast<Instruction>(Root)) << "\n"); 5898 5899 return true; 5900 } 5901 5902 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const { 5903 if (RdxKind != RecurKind::Or) 5904 return false; 5905 5906 unsigned NumElts = VectorizableTree[0]->Scalars.size(); 5907 Value *FirstReduced = VectorizableTree[0]->Scalars[0]; 5908 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI, 5909 /* MatchOr */ false); 5910 } 5911 5912 bool BoUpSLP::isLoadCombineCandidate() const { 5913 // Peek through a final sequence of stores and check if all operations are 5914 // likely to be load-combined. 5915 unsigned NumElts = VectorizableTree[0]->Scalars.size(); 5916 for (Value *Scalar : VectorizableTree[0]->Scalars) { 5917 Value *X; 5918 if (!match(Scalar, m_Store(m_Value(X), m_Value())) || 5919 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true)) 5920 return false; 5921 } 5922 return true; 5923 } 5924 5925 bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { 5926 // No need to vectorize inserts of gathered values. 5927 if (VectorizableTree.size() == 2 && 5928 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) && 5929 VectorizableTree[1]->State == TreeEntry::NeedToGather) 5930 return true; 5931 5932 // We can vectorize the tree if its size is greater than or equal to the 5933 // minimum size specified by the MinTreeSize command line option. 
5934 if (VectorizableTree.size() >= MinTreeSize) 5935 return false; 5936 5937 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we 5938 // can vectorize it if we can prove it fully vectorizable. 5939 if (isFullyVectorizableTinyTree(ForReduction)) 5940 return false; 5941 5942 assert(VectorizableTree.empty() 5943 ? ExternalUses.empty() 5944 : true && "We shouldn't have any external users"); 5945 5946 // Otherwise, we can't vectorize the tree. It is both tiny and not fully 5947 // vectorizable. 5948 return true; 5949 } 5950 5951 InstructionCost BoUpSLP::getSpillCost() const { 5952 // Walk from the bottom of the tree to the top, tracking which values are 5953 // live. When we see a call instruction that is not part of our tree, 5954 // query TTI to see if there is a cost to keeping values live over it 5955 // (for example, if spills and fills are required). 5956 unsigned BundleWidth = VectorizableTree.front()->Scalars.size(); 5957 InstructionCost Cost = 0; 5958 5959 SmallPtrSet<Instruction*, 4> LiveValues; 5960 Instruction *PrevInst = nullptr; 5961 5962 // The entries in VectorizableTree are not necessarily ordered by their 5963 // position in basic blocks. Collect them and order them by dominance so later 5964 // instructions are guaranteed to be visited first. For instructions in 5965 // different basic blocks, we only scan to the beginning of the block, so 5966 // their order does not matter, as long as all instructions in a basic block 5967 // are grouped together. Using dominance ensures a deterministic order. 5968 SmallVector<Instruction *, 16> OrderedScalars; 5969 for (const auto &TEPtr : VectorizableTree) { 5970 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]); 5971 if (!Inst) 5972 continue; 5973 OrderedScalars.push_back(Inst); 5974 } 5975 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) { 5976 auto *NodeA = DT->getNode(A->getParent()); 5977 auto *NodeB = DT->getNode(B->getParent()); 5978 assert(NodeA && "Should only process reachable instructions"); 5979 assert(NodeB && "Should only process reachable instructions"); 5980 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && 5981 "Different nodes should have different DFS numbers"); 5982 if (NodeA != NodeB) 5983 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn(); 5984 return B->comesBefore(A); 5985 }); 5986 5987 for (Instruction *Inst : OrderedScalars) { 5988 if (!PrevInst) { 5989 PrevInst = Inst; 5990 continue; 5991 } 5992 5993 // Update LiveValues. 5994 LiveValues.erase(PrevInst); 5995 for (auto &J : PrevInst->operands()) { 5996 if (isa<Instruction>(&*J) && getTreeEntry(&*J)) 5997 LiveValues.insert(cast<Instruction>(&*J)); 5998 } 5999 6000 LLVM_DEBUG({ 6001 dbgs() << "SLP: #LV: " << LiveValues.size(); 6002 for (auto *X : LiveValues) 6003 dbgs() << " " << X->getName(); 6004 dbgs() << ", Looking at "; 6005 Inst->dump(); 6006 }); 6007 6008 // Now find the sequence of instructions between PrevInst and Inst. 6009 unsigned NumCalls = 0; 6010 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(), 6011 PrevInstIt = 6012 PrevInst->getIterator().getReverse(); 6013 while (InstIt != PrevInstIt) { 6014 if (PrevInstIt == PrevInst->getParent()->rend()) { 6015 PrevInstIt = Inst->getParent()->rbegin(); 6016 continue; 6017 } 6018 6019 // Debug information does not impact spill cost. 
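// For instance (illustrative): if a call to some external function sits
// between two bundled instructions while N tree values are live, the cost of
// keeping N vectors of width BundleWidth live across that call is added via
// getCostOfKeepingLiveOverCall() below.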
6020 if ((isa<CallInst>(&*PrevInstIt) && 6021 !isa<DbgInfoIntrinsic>(&*PrevInstIt)) && 6022 &*PrevInstIt != PrevInst) 6023 NumCalls++; 6024 6025 ++PrevInstIt; 6026 } 6027 6028 if (NumCalls) { 6029 SmallVector<Type*, 4> V; 6030 for (auto *II : LiveValues) { 6031 auto *ScalarTy = II->getType(); 6032 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy)) 6033 ScalarTy = VectorTy->getElementType(); 6034 V.push_back(FixedVectorType::get(ScalarTy, BundleWidth)); 6035 } 6036 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V); 6037 } 6038 6039 PrevInst = Inst; 6040 } 6041 6042 return Cost; 6043 } 6044 6045 /// Check if two insertelement instructions are from the same buildvector. 6046 static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, 6047 InsertElementInst *V) { 6048 // Instructions must be from the same basic blocks. 6049 if (VU->getParent() != V->getParent()) 6050 return false; 6051 // Checks if 2 insertelements are from the same buildvector. 6052 if (VU->getType() != V->getType()) 6053 return false; 6054 // Multiple used inserts are separate nodes. 6055 if (!VU->hasOneUse() && !V->hasOneUse()) 6056 return false; 6057 auto *IE1 = VU; 6058 auto *IE2 = V; 6059 // Go through the vector operand of insertelement instructions trying to find 6060 // either VU as the original vector for IE2 or V as the original vector for 6061 // IE1. 6062 do { 6063 if (IE2 == VU || IE1 == V) 6064 return true; 6065 if (IE1) { 6066 if (IE1 != VU && !IE1->hasOneUse()) 6067 IE1 = nullptr; 6068 else 6069 IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0)); 6070 } 6071 if (IE2) { 6072 if (IE2 != V && !IE2->hasOneUse()) 6073 IE2 = nullptr; 6074 else 6075 IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0)); 6076 } 6077 } while (IE1 || IE2); 6078 return false; 6079 } 6080 6081 InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { 6082 InstructionCost Cost = 0; 6083 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " 6084 << VectorizableTree.size() << ".\n"); 6085 6086 unsigned BundleWidth = VectorizableTree[0]->Scalars.size(); 6087 6088 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { 6089 TreeEntry &TE = *VectorizableTree[I]; 6090 6091 InstructionCost C = getEntryCost(&TE, VectorizedVals); 6092 Cost += C; 6093 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C 6094 << " for bundle that starts with " << *TE.Scalars[0] 6095 << ".\n" 6096 << "SLP: Current total cost = " << Cost << "\n"); 6097 } 6098 6099 SmallPtrSet<Value *, 16> ExtractCostCalculated; 6100 InstructionCost ExtractCost = 0; 6101 SmallVector<unsigned> VF; 6102 SmallVector<SmallVector<int>> ShuffleMask; 6103 SmallVector<Value *> FirstUsers; 6104 SmallVector<APInt> DemandedElts; 6105 for (ExternalUser &EU : ExternalUses) { 6106 // We only add extract cost once for the same scalar. 6107 if (!isa_and_nonnull<InsertElementInst>(EU.User) && 6108 !ExtractCostCalculated.insert(EU.Scalar).second) 6109 continue; 6110 6111 // Uses by ephemeral values are free (because the ephemeral value will be 6112 // removed prior to code generation, and so the extraction will be 6113 // removed as well). 6114 if (EphValues.count(EU.User)) 6115 continue; 6116 6117 // No extract cost for vector "scalar" 6118 if (isa<FixedVectorType>(EU.Scalar->getType())) 6119 continue; 6120 6121 // Already counted the cost for external uses when tried to adjust the cost 6122 // for extractelements, no need to add it again. 
6123 if (isa<ExtractElementInst>(EU.Scalar)) 6124 continue; 6125 6126 // If found user is an insertelement, do not calculate extract cost but try 6127 // to detect it as a final shuffled/identity match. 6128 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) { 6129 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) { 6130 Optional<unsigned> InsertIdx = getInsertIndex(VU); 6131 if (InsertIdx) { 6132 auto *It = find_if(FirstUsers, [VU](Value *V) { 6133 return areTwoInsertFromSameBuildVector(VU, 6134 cast<InsertElementInst>(V)); 6135 }); 6136 int VecId = -1; 6137 if (It == FirstUsers.end()) { 6138 VF.push_back(FTy->getNumElements()); 6139 ShuffleMask.emplace_back(VF.back(), UndefMaskElem); 6140 // Find the insertvector, vectorized in tree, if any. 6141 Value *Base = VU; 6142 while (isa<InsertElementInst>(Base)) { 6143 // Build the mask for the vectorized insertelement instructions. 6144 if (const TreeEntry *E = getTreeEntry(Base)) { 6145 VU = cast<InsertElementInst>(Base); 6146 do { 6147 int Idx = E->findLaneForValue(Base); 6148 ShuffleMask.back()[Idx] = Idx; 6149 Base = cast<InsertElementInst>(Base)->getOperand(0); 6150 } while (E == getTreeEntry(Base)); 6151 break; 6152 } 6153 Base = cast<InsertElementInst>(Base)->getOperand(0); 6154 } 6155 FirstUsers.push_back(VU); 6156 DemandedElts.push_back(APInt::getZero(VF.back())); 6157 VecId = FirstUsers.size() - 1; 6158 } else { 6159 VecId = std::distance(FirstUsers.begin(), It); 6160 } 6161 ShuffleMask[VecId][*InsertIdx] = EU.Lane; 6162 DemandedElts[VecId].setBit(*InsertIdx); 6163 continue; 6164 } 6165 } 6166 } 6167 6168 // If we plan to rewrite the tree in a smaller type, we will need to sign 6169 // extend the extracted value back to the original type. Here, we account 6170 // for the extract and the added cost of the sign extend if needed. 6171 auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); 6172 auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; 6173 if (MinBWs.count(ScalarRoot)) { 6174 auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); 6175 auto Extend = 6176 MinBWs[ScalarRoot].second ? 
Instruction::SExt : Instruction::ZExt; 6177 VecTy = FixedVectorType::get(MinTy, BundleWidth); 6178 ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), 6179 VecTy, EU.Lane); 6180 } else { 6181 ExtractCost += 6182 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); 6183 } 6184 } 6185 6186 InstructionCost SpillCost = getSpillCost(); 6187 Cost += SpillCost + ExtractCost; 6188 if (FirstUsers.size() == 1) { 6189 int Limit = ShuffleMask.front().size() * 2; 6190 if (all_of(ShuffleMask.front(), [Limit](int Idx) { return Idx < Limit; }) && 6191 !ShuffleVectorInst::isIdentityMask(ShuffleMask.front())) { 6192 InstructionCost C = TTI->getShuffleCost( 6193 TTI::SK_PermuteSingleSrc, 6194 cast<FixedVectorType>(FirstUsers.front()->getType()), 6195 ShuffleMask.front()); 6196 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C 6197 << " for final shuffle of insertelement external users " 6198 << *VectorizableTree.front()->Scalars.front() << ".\n" 6199 << "SLP: Current total cost = " << Cost << "\n"); 6200 Cost += C; 6201 } 6202 InstructionCost InsertCost = TTI->getScalarizationOverhead( 6203 cast<FixedVectorType>(FirstUsers.front()->getType()), 6204 DemandedElts.front(), /*Insert*/ true, /*Extract*/ false); 6205 LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost 6206 << " for insertelements gather.\n" 6207 << "SLP: Current total cost = " << Cost << "\n"); 6208 Cost -= InsertCost; 6209 } else if (FirstUsers.size() >= 2) { 6210 unsigned MaxVF = *std::max_element(VF.begin(), VF.end()); 6211 // Combined masks of the first 2 vectors. 6212 SmallVector<int> CombinedMask(MaxVF, UndefMaskElem); 6213 copy(ShuffleMask.front(), CombinedMask.begin()); 6214 APInt CombinedDemandedElts = DemandedElts.front().zextOrSelf(MaxVF); 6215 auto *VecTy = FixedVectorType::get( 6216 cast<VectorType>(FirstUsers.front()->getType())->getElementType(), 6217 MaxVF); 6218 for (int I = 0, E = ShuffleMask[1].size(); I < E; ++I) { 6219 if (ShuffleMask[1][I] != UndefMaskElem) { 6220 CombinedMask[I] = ShuffleMask[1][I] + MaxVF; 6221 CombinedDemandedElts.setBit(I); 6222 } 6223 } 6224 InstructionCost C = 6225 TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); 6226 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C 6227 << " for final shuffle of vector node and external " 6228 "insertelement users " 6229 << *VectorizableTree.front()->Scalars.front() << ".\n" 6230 << "SLP: Current total cost = " << Cost << "\n"); 6231 Cost += C; 6232 InstructionCost InsertCost = TTI->getScalarizationOverhead( 6233 VecTy, CombinedDemandedElts, /*Insert*/ true, /*Extract*/ false); 6234 LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost 6235 << " for insertelements gather.\n" 6236 << "SLP: Current total cost = " << Cost << "\n"); 6237 Cost -= InsertCost; 6238 for (int I = 2, E = FirstUsers.size(); I < E; ++I) { 6239 // Other elements - permutation of 2 vectors (the initial one and the 6240 // next Ith incoming vector). 
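// For illustration, with MaxVF = 4 and ShuffleMask[I] = <u, 0, u, 1>, the loop
// below takes lanes 1 and 3 from the Ith incoming vector (mask values 4 and 5)
// and keeps lanes 0 and 2 of the previously combined shuffle via the identity
// indices 0 and 2.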
6241 unsigned VF = ShuffleMask[I].size();
6242 for (unsigned Idx = 0; Idx < VF; ++Idx) {
6243 int Mask = ShuffleMask[I][Idx];
6244 if (Mask != UndefMaskElem)
6245 CombinedMask[Idx] = MaxVF + Mask;
6246 else if (CombinedMask[Idx] != UndefMaskElem)
6247 CombinedMask[Idx] = Idx;
6248 }
6249 for (unsigned Idx = VF; Idx < MaxVF; ++Idx)
6250 if (CombinedMask[Idx] != UndefMaskElem)
6251 CombinedMask[Idx] = Idx;
6252 InstructionCost C =
6253 TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask);
6254 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
6255 << " for final shuffle of vector node and external "
6256 "insertelement users "
6257 << *VectorizableTree.front()->Scalars.front() << ".\n"
6258 << "SLP: Current total cost = " << Cost << "\n");
6259 Cost += C;
6260 InstructionCost InsertCost = TTI->getScalarizationOverhead(
6261 cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I],
6262 /*Insert*/ true, /*Extract*/ false);
6263 LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
6264 << " for insertelements gather.\n"
6265 << "SLP: Current total cost = " << Cost << "\n");
6266 Cost -= InsertCost;
6267 }
6268 }
6269
6270 #ifndef NDEBUG
6271 SmallString<256> Str;
6272 {
6273 raw_svector_ostream OS(Str);
6274 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
6275 << "SLP: Extract Cost = " << ExtractCost << ".\n"
6276 << "SLP: Total Cost = " << Cost << ".\n";
6277 }
6278 LLVM_DEBUG(dbgs() << Str);
6279 if (ViewSLPTree)
6280 ViewGraph(this, "SLP" + F->getName(), false, Str);
6281 #endif
6282
6283 return Cost;
6284 }
6285
6286 Optional<TargetTransformInfo::ShuffleKind>
6287 BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
6288 SmallVectorImpl<const TreeEntry *> &Entries) {
6289 // TODO: currently checking only for Scalars in the tree entry, need to count
6290 // reused elements too for better cost estimation.
6291 Mask.assign(TE->Scalars.size(), UndefMaskElem);
6292 Entries.clear();
6293 // Build a map from values to the gather tree entries that contain them.
6294 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>> ValueToTEs;
6295 for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) {
6296 if (EntryPtr.get() == TE)
6297 break;
6298 if (EntryPtr->State != TreeEntry::NeedToGather)
6299 continue;
6300 for (Value *V : EntryPtr->Scalars)
6301 ValueToTEs.try_emplace(V).first->getSecond().insert(EntryPtr.get());
6302 }
6303 // Find all tree entries used by the gathered values. If no common entries
6304 // are found, this is not a shuffle.
6305 // Here we build a set of tree nodes for each gathered value and try to
6306 // find the intersection between these sets. If we have at least one common
6307 // tree node for each gathered value, we have just a permutation of a
6308 // single vector. If we end up with 2 different sets, we have a permutation
6309 // of 2 input vectors.
6310 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
6311 DenseMap<Value *, int> UsedValuesEntry;
6312 for (Value *V : TE->Scalars) {
6313 if (isa<UndefValue>(V))
6314 continue;
6315 // Build a list of tree entries where V is used.
6316 SmallPtrSet<const TreeEntry *, 4> VToTEs;
6317 auto It = ValueToTEs.find(V);
6318 if (It != ValueToTEs.end())
6319 VToTEs = It->second;
6320 if (const TreeEntry *VTE = getTreeEntry(V))
6321 VToTEs.insert(VTE);
6322 if (VToTEs.empty())
6323 return None;
6324 if (UsedTEs.empty()) {
6325 // On the first iteration, just insert the list of nodes into the vector.
6326 UsedTEs.push_back(VToTEs); 6327 } else { 6328 // Need to check if there are any previously used tree nodes which use V. 6329 // If there are no such nodes, consider that we have another one input 6330 // vector. 6331 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs); 6332 unsigned Idx = 0; 6333 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) { 6334 // Do we have a non-empty intersection of previously listed tree entries 6335 // and tree entries using current V? 6336 set_intersect(VToTEs, Set); 6337 if (!VToTEs.empty()) { 6338 // Yes, write the new subset and continue analysis for the next 6339 // scalar. 6340 Set.swap(VToTEs); 6341 break; 6342 } 6343 VToTEs = SavedVToTEs; 6344 ++Idx; 6345 } 6346 // No non-empty intersection found - need to add a second set of possible 6347 // source vectors. 6348 if (Idx == UsedTEs.size()) { 6349 // If the number of input vectors is greater than 2 - not a permutation, 6350 // fallback to the regular gather. 6351 if (UsedTEs.size() == 2) 6352 return None; 6353 UsedTEs.push_back(SavedVToTEs); 6354 Idx = UsedTEs.size() - 1; 6355 } 6356 UsedValuesEntry.try_emplace(V, Idx); 6357 } 6358 } 6359 6360 unsigned VF = 0; 6361 if (UsedTEs.size() == 1) { 6362 // Try to find the perfect match in another gather node at first. 6363 auto It = find_if(UsedTEs.front(), [TE](const TreeEntry *EntryPtr) { 6364 return EntryPtr->isSame(TE->Scalars); 6365 }); 6366 if (It != UsedTEs.front().end()) { 6367 Entries.push_back(*It); 6368 std::iota(Mask.begin(), Mask.end(), 0); 6369 return TargetTransformInfo::SK_PermuteSingleSrc; 6370 } 6371 // No perfect match, just shuffle, so choose the first tree node. 6372 Entries.push_back(*UsedTEs.front().begin()); 6373 } else { 6374 // Try to find nodes with the same vector factor. 6375 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries."); 6376 DenseMap<int, const TreeEntry *> VFToTE; 6377 for (const TreeEntry *TE : UsedTEs.front()) 6378 VFToTE.try_emplace(TE->getVectorFactor(), TE); 6379 for (const TreeEntry *TE : UsedTEs.back()) { 6380 auto It = VFToTE.find(TE->getVectorFactor()); 6381 if (It != VFToTE.end()) { 6382 VF = It->first; 6383 Entries.push_back(It->second); 6384 Entries.push_back(TE); 6385 break; 6386 } 6387 } 6388 // No 2 source vectors with the same vector factor - give up and do regular 6389 // gather. 6390 if (Entries.empty()) 6391 return None; 6392 } 6393 6394 // Build a shuffle mask for better cost estimation and vector emission. 6395 for (int I = 0, E = TE->Scalars.size(); I < E; ++I) { 6396 Value *V = TE->Scalars[I]; 6397 if (isa<UndefValue>(V)) 6398 continue; 6399 unsigned Idx = UsedValuesEntry.lookup(V); 6400 const TreeEntry *VTE = Entries[Idx]; 6401 int FoundLane = VTE->findLaneForValue(V); 6402 Mask[I] = Idx * VF + FoundLane; 6403 // Extra check required by isSingleSourceMaskImpl function (called by 6404 // ShuffleVectorInst::isSingleSourceMask). 
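// (isSingleSourceMaskImpl expects every mask value to be smaller than twice the
// mask width, so a larger index cannot be classified and we fall back to a
// regular gather.)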
6405 if (Mask[I] >= 2 * E) 6406 return None; 6407 } 6408 switch (Entries.size()) { 6409 case 1: 6410 return TargetTransformInfo::SK_PermuteSingleSrc; 6411 case 2: 6412 return TargetTransformInfo::SK_PermuteTwoSrc; 6413 default: 6414 break; 6415 } 6416 return None; 6417 } 6418 6419 InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty, 6420 const APInt &ShuffledIndices, 6421 bool NeedToShuffle) const { 6422 InstructionCost Cost = 6423 TTI->getScalarizationOverhead(Ty, ~ShuffledIndices, /*Insert*/ true, 6424 /*Extract*/ false); 6425 if (NeedToShuffle) 6426 Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); 6427 return Cost; 6428 } 6429 6430 InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { 6431 // Find the type of the operands in VL. 6432 Type *ScalarTy = VL[0]->getType(); 6433 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 6434 ScalarTy = SI->getValueOperand()->getType(); 6435 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); 6436 bool DuplicateNonConst = false; 6437 // Find the cost of inserting/extracting values from the vector. 6438 // Check if the same elements are inserted several times and count them as 6439 // shuffle candidates. 6440 APInt ShuffledElements = APInt::getZero(VL.size()); 6441 DenseSet<Value *> UniqueElements; 6442 // Iterate in reverse order to consider insert elements with the high cost. 6443 for (unsigned I = VL.size(); I > 0; --I) { 6444 unsigned Idx = I - 1; 6445 // No need to shuffle duplicates for constants. 6446 if (isConstant(VL[Idx])) { 6447 ShuffledElements.setBit(Idx); 6448 continue; 6449 } 6450 if (!UniqueElements.insert(VL[Idx]).second) { 6451 DuplicateNonConst = true; 6452 ShuffledElements.setBit(Idx); 6453 } 6454 } 6455 return getGatherCost(VecTy, ShuffledElements, DuplicateNonConst); 6456 } 6457 6458 // Perform operand reordering on the instructions in VL and return the reordered 6459 // operands in Left and Right. 6460 void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, 6461 SmallVectorImpl<Value *> &Left, 6462 SmallVectorImpl<Value *> &Right, 6463 const DataLayout &DL, 6464 ScalarEvolution &SE, 6465 const BoUpSLP &R) { 6466 if (VL.empty()) 6467 return; 6468 VLOperands Ops(VL, DL, SE, R); 6469 // Reorder the operands in place. 6470 Ops.reorder(); 6471 Left = Ops.getVL(0); 6472 Right = Ops.getVL(1); 6473 } 6474 6475 void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { 6476 // Get the basic block this bundle is in. All instructions in the bundle 6477 // should be in this block. 6478 auto *Front = E->getMainOp(); 6479 auto *BB = Front->getParent(); 6480 assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool { 6481 auto *I = cast<Instruction>(V); 6482 return !E->isOpcodeOrAlt(I) || I->getParent() == BB; 6483 })); 6484 6485 auto &&FindLastInst = [E, Front]() { 6486 Instruction *LastInst = Front; 6487 for (Value *V : E->Scalars) { 6488 auto *I = dyn_cast<Instruction>(V); 6489 if (!I) 6490 continue; 6491 if (LastInst->comesBefore(I)) 6492 LastInst = I; 6493 } 6494 return LastInst; 6495 }; 6496 6497 auto &&FindFirstInst = [E, Front]() { 6498 Instruction *FirstInst = Front; 6499 for (Value *V : E->Scalars) { 6500 auto *I = dyn_cast<Instruction>(V); 6501 if (!I) 6502 continue; 6503 if (I->comesBefore(FirstInst)) 6504 FirstInst = I; 6505 } 6506 return FirstInst; 6507 }; 6508 6509 // Set the insert point to the beginning of the basic block if the entry 6510 // should not be scheduled. 
6511 if (E->State != TreeEntry::NeedToGather && 6512 doesNotNeedToSchedule(E->Scalars)) { 6513 BasicBlock::iterator InsertPt; 6514 if (all_of(E->Scalars, isUsedOutsideBlock)) 6515 InsertPt = FindLastInst()->getIterator(); 6516 else 6517 InsertPt = FindFirstInst()->getIterator(); 6518 Builder.SetInsertPoint(BB, InsertPt); 6519 Builder.SetCurrentDebugLocation(Front->getDebugLoc()); 6520 return; 6521 } 6522 6523 // The last instruction in the bundle in program order. 6524 Instruction *LastInst = nullptr; 6525 6526 // Find the last instruction. The common case should be that BB has been 6527 // scheduled, and the last instruction is VL.back(). So we start with 6528 // VL.back() and iterate over schedule data until we reach the end of the 6529 // bundle. The end of the bundle is marked by null ScheduleData. 6530 if (BlocksSchedules.count(BB)) { 6531 Value *V = E->isOneOf(E->Scalars.back()); 6532 if (doesNotNeedToBeScheduled(V)) 6533 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled); 6534 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V); 6535 if (Bundle && Bundle->isPartOfBundle()) 6536 for (; Bundle; Bundle = Bundle->NextInBundle) 6537 if (Bundle->OpValue == Bundle->Inst) 6538 LastInst = Bundle->Inst; 6539 } 6540 6541 // LastInst can still be null at this point if there's either not an entry 6542 // for BB in BlocksSchedules or there's no ScheduleData available for 6543 // VL.back(). This can be the case if buildTree_rec aborts for various 6544 // reasons (e.g., the maximum recursion depth is reached, the maximum region 6545 // size is reached, etc.). ScheduleData is initialized in the scheduling 6546 // "dry-run". 6547 // 6548 // If this happens, we can still find the last instruction by brute force. We 6549 // iterate forwards from Front (inclusive) until we either see all 6550 // instructions in the bundle or reach the end of the block. If Front is the 6551 // last instruction in program order, LastInst will be set to Front, and we 6552 // will visit all the remaining instructions in the block. 6553 // 6554 // One of the reasons we exit early from buildTree_rec is to place an upper 6555 // bound on compile-time. Thus, taking an additional compile-time hit here is 6556 // not ideal. However, this should be exceedingly rare since it requires that 6557 // we both exit early from buildTree_rec and that the bundle be out-of-order 6558 // (causing us to iterate all the way to the end of the block). 6559 if (!LastInst) 6560 LastInst = FindLastInst(); 6561 assert(LastInst && "Failed to find last instruction in bundle"); 6562 6563 // Set the insertion point after the last instruction in the bundle. Set the 6564 // debug location to Front. 6565 Builder.SetInsertPoint(BB, ++LastInst->getIterator()); 6566 Builder.SetCurrentDebugLocation(Front->getDebugLoc()); 6567 } 6568 6569 Value *BoUpSLP::gather(ArrayRef<Value *> VL) { 6570 // List of instructions/lanes from current block and/or the blocks which are 6571 // part of the current loop. These instructions will be inserted at the end to 6572 // make it possible to optimize loops and hoist invariant instructions out of 6573 // the loops body with better chances for success. 
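// E.g. an element defined inside the loop is inserted last, so the preceding
// insertelements (poison vector plus loop-invariant elements) remain hoistable
// by the LICM step in optimizeGatherSequence().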
6574 SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts; 6575 SmallSet<int, 4> PostponedIndices; 6576 Loop *L = LI->getLoopFor(Builder.GetInsertBlock()); 6577 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) { 6578 SmallPtrSet<BasicBlock *, 4> Visited; 6579 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second) 6580 InsertBB = InsertBB->getSinglePredecessor(); 6581 return InsertBB && InsertBB == InstBB; 6582 }; 6583 for (int I = 0, E = VL.size(); I < E; ++I) { 6584 if (auto *Inst = dyn_cast<Instruction>(VL[I])) 6585 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) || 6586 getTreeEntry(Inst) || (L && (L->contains(Inst)))) && 6587 PostponedIndices.insert(I).second) 6588 PostponedInsts.emplace_back(Inst, I); 6589 } 6590 6591 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos) { 6592 Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(Pos)); 6593 auto *InsElt = dyn_cast<InsertElementInst>(Vec); 6594 if (!InsElt) 6595 return Vec; 6596 GatherShuffleSeq.insert(InsElt); 6597 CSEBlocks.insert(InsElt->getParent()); 6598 // Add to our 'need-to-extract' list. 6599 if (TreeEntry *Entry = getTreeEntry(V)) { 6600 // Find which lane we need to extract. 6601 unsigned FoundLane = Entry->findLaneForValue(V); 6602 ExternalUses.emplace_back(V, InsElt, FoundLane); 6603 } 6604 return Vec; 6605 }; 6606 Value *Val0 = 6607 isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0]; 6608 FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size()); 6609 Value *Vec = PoisonValue::get(VecTy); 6610 SmallVector<int> NonConsts; 6611 // Insert constant values at first. 6612 for (int I = 0, E = VL.size(); I < E; ++I) { 6613 if (PostponedIndices.contains(I)) 6614 continue; 6615 if (!isConstant(VL[I])) { 6616 NonConsts.push_back(I); 6617 continue; 6618 } 6619 Vec = CreateInsertElement(Vec, VL[I], I); 6620 } 6621 // Insert non-constant values. 6622 for (int I : NonConsts) 6623 Vec = CreateInsertElement(Vec, VL[I], I); 6624 // Append instructions, which are/may be part of the loop, in the end to make 6625 // it possible to hoist non-loop-based instructions. 6626 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts) 6627 Vec = CreateInsertElement(Vec, Pair.first, Pair.second); 6628 6629 return Vec; 6630 } 6631 6632 namespace { 6633 /// Merges shuffle masks and emits final shuffle instruction, if required. 6634 class ShuffleInstructionBuilder { 6635 IRBuilderBase &Builder; 6636 const unsigned VF = 0; 6637 bool IsFinalized = false; 6638 SmallVector<int, 4> Mask; 6639 /// Holds all of the instructions that we gathered. 6640 SetVector<Instruction *> &GatherShuffleSeq; 6641 /// A list of blocks that we are going to CSE. 6642 SetVector<BasicBlock *> &CSEBlocks; 6643 6644 public: 6645 ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF, 6646 SetVector<Instruction *> &GatherShuffleSeq, 6647 SetVector<BasicBlock *> &CSEBlocks) 6648 : Builder(Builder), VF(VF), GatherShuffleSeq(GatherShuffleSeq), 6649 CSEBlocks(CSEBlocks) {} 6650 6651 /// Adds a mask, inverting it before applying. 6652 void addInversedMask(ArrayRef<unsigned> SubMask) { 6653 if (SubMask.empty()) 6654 return; 6655 SmallVector<int, 4> NewMask; 6656 inversePermutation(SubMask, NewMask); 6657 addMask(NewMask); 6658 } 6659 6660 /// Functions adds masks, merging them into single one. 
6661 void addMask(ArrayRef<unsigned> SubMask) { 6662 SmallVector<int, 4> NewMask(SubMask.begin(), SubMask.end()); 6663 addMask(NewMask); 6664 } 6665 6666 void addMask(ArrayRef<int> SubMask) { ::addMask(Mask, SubMask); } 6667 6668 Value *finalize(Value *V) { 6669 IsFinalized = true; 6670 unsigned ValueVF = cast<FixedVectorType>(V->getType())->getNumElements(); 6671 if (VF == ValueVF && Mask.empty()) 6672 return V; 6673 SmallVector<int, 4> NormalizedMask(VF, UndefMaskElem); 6674 std::iota(NormalizedMask.begin(), NormalizedMask.end(), 0); 6675 addMask(NormalizedMask); 6676 6677 if (VF == ValueVF && ShuffleVectorInst::isIdentityMask(Mask)) 6678 return V; 6679 Value *Vec = Builder.CreateShuffleVector(V, Mask, "shuffle"); 6680 if (auto *I = dyn_cast<Instruction>(Vec)) { 6681 GatherShuffleSeq.insert(I); 6682 CSEBlocks.insert(I->getParent()); 6683 } 6684 return Vec; 6685 } 6686 6687 ~ShuffleInstructionBuilder() { 6688 assert((IsFinalized || Mask.empty()) && 6689 "Shuffle construction must be finalized."); 6690 } 6691 }; 6692 } // namespace 6693 6694 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { 6695 const unsigned VF = VL.size(); 6696 InstructionsState S = getSameOpcode(VL); 6697 if (S.getOpcode()) { 6698 if (TreeEntry *E = getTreeEntry(S.OpValue)) 6699 if (E->isSame(VL)) { 6700 Value *V = vectorizeTree(E); 6701 if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) { 6702 if (!E->ReuseShuffleIndices.empty()) { 6703 // Reshuffle to get only unique values. 6704 // If some of the scalars are duplicated in the vectorization tree 6705 // entry, we do not vectorize them but instead generate a mask for 6706 // the reuses. But if there are several users of the same entry, 6707 // they may have different vectorization factors. This is especially 6708 // important for PHI nodes. In this case, we need to adapt the 6709 // resulting instruction for the user vectorization factor and have 6710 // to reshuffle it again to take only unique elements of the vector. 6711 // Without this code the function incorrectly returns reduced vector 6712 // instruction with the same elements, not with the unique ones. 6713 6714 // block: 6715 // %phi = phi <2 x > { .., %entry} {%shuffle, %block} 6716 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0> 6717 // ... (use %2) 6718 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0} 6719 // br %block 6720 SmallVector<int> UniqueIdxs(VF, UndefMaskElem); 6721 SmallSet<int, 4> UsedIdxs; 6722 int Pos = 0; 6723 int Sz = VL.size(); 6724 for (int Idx : E->ReuseShuffleIndices) { 6725 if (Idx != Sz && Idx != UndefMaskElem && 6726 UsedIdxs.insert(Idx).second) 6727 UniqueIdxs[Idx] = Pos; 6728 ++Pos; 6729 } 6730 assert(VF >= UsedIdxs.size() && "Expected vectorization factor " 6731 "less than original vector size."); 6732 UniqueIdxs.append(VF - UsedIdxs.size(), UndefMaskElem); 6733 V = Builder.CreateShuffleVector(V, UniqueIdxs, "shrink.shuffle"); 6734 } else { 6735 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() && 6736 "Expected vectorization factor less " 6737 "than original vector size."); 6738 SmallVector<int> UniformMask(VF, 0); 6739 std::iota(UniformMask.begin(), UniformMask.end(), 0); 6740 V = Builder.CreateShuffleVector(V, UniformMask, "shrink.shuffle"); 6741 } 6742 if (auto *I = dyn_cast<Instruction>(V)) { 6743 GatherShuffleSeq.insert(I); 6744 CSEBlocks.insert(I->getParent()); 6745 } 6746 } 6747 return V; 6748 } 6749 } 6750 6751 // Can't vectorize this, so simply build a new vector with each lane 6752 // corresponding to the requested value. 
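// E.g. VL = {%a, %b, %a, undef} is emitted as an insertelement chain for the
// unique values {%a, %b} plus a reuse shuffle; both are built by
// createBuildVector() below.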
6753 return createBuildVector(VL); 6754 } 6755 Value *BoUpSLP::createBuildVector(ArrayRef<Value *> VL) { 6756 unsigned VF = VL.size(); 6757 // Exploit possible reuse of values across lanes. 6758 SmallVector<int> ReuseShuffleIndicies; 6759 SmallVector<Value *> UniqueValues; 6760 if (VL.size() > 2) { 6761 DenseMap<Value *, unsigned> UniquePositions; 6762 unsigned NumValues = 6763 std::distance(VL.begin(), find_if(reverse(VL), [](Value *V) { 6764 return !isa<UndefValue>(V); 6765 }).base()); 6766 VF = std::max<unsigned>(VF, PowerOf2Ceil(NumValues)); 6767 int UniqueVals = 0; 6768 for (Value *V : VL.drop_back(VL.size() - VF)) { 6769 if (isa<UndefValue>(V)) { 6770 ReuseShuffleIndicies.emplace_back(UndefMaskElem); 6771 continue; 6772 } 6773 if (isConstant(V)) { 6774 ReuseShuffleIndicies.emplace_back(UniqueValues.size()); 6775 UniqueValues.emplace_back(V); 6776 continue; 6777 } 6778 auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); 6779 ReuseShuffleIndicies.emplace_back(Res.first->second); 6780 if (Res.second) { 6781 UniqueValues.emplace_back(V); 6782 ++UniqueVals; 6783 } 6784 } 6785 if (UniqueVals == 1 && UniqueValues.size() == 1) { 6786 // Emit pure splat vector. 6787 ReuseShuffleIndicies.append(VF - ReuseShuffleIndicies.size(), 6788 UndefMaskElem); 6789 } else if (UniqueValues.size() >= VF - 1 || UniqueValues.size() <= 1) { 6790 ReuseShuffleIndicies.clear(); 6791 UniqueValues.clear(); 6792 UniqueValues.append(VL.begin(), std::next(VL.begin(), NumValues)); 6793 } 6794 UniqueValues.append(VF - UniqueValues.size(), 6795 PoisonValue::get(VL[0]->getType())); 6796 VL = UniqueValues; 6797 } 6798 6799 ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq, 6800 CSEBlocks); 6801 Value *Vec = gather(VL); 6802 if (!ReuseShuffleIndicies.empty()) { 6803 ShuffleBuilder.addMask(ReuseShuffleIndicies); 6804 Vec = ShuffleBuilder.finalize(Vec); 6805 } 6806 return Vec; 6807 } 6808 6809 Value *BoUpSLP::vectorizeTree(TreeEntry *E) { 6810 IRBuilder<>::InsertPointGuard Guard(Builder); 6811 6812 if (E->VectorizedValue) { 6813 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); 6814 return E->VectorizedValue; 6815 } 6816 6817 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); 6818 unsigned VF = E->getVectorFactor(); 6819 ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq, 6820 CSEBlocks); 6821 if (E->State == TreeEntry::NeedToGather) { 6822 if (E->getMainOp()) 6823 setInsertPointAfterBundle(E); 6824 Value *Vec; 6825 SmallVector<int> Mask; 6826 SmallVector<const TreeEntry *> Entries; 6827 Optional<TargetTransformInfo::ShuffleKind> Shuffle = 6828 isGatherShuffledEntry(E, Mask, Entries); 6829 if (Shuffle.hasValue()) { 6830 assert((Entries.size() == 1 || Entries.size() == 2) && 6831 "Expected shuffle of 1 or 2 entries."); 6832 Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue, 6833 Entries.back()->VectorizedValue, Mask); 6834 if (auto *I = dyn_cast<Instruction>(Vec)) { 6835 GatherShuffleSeq.insert(I); 6836 CSEBlocks.insert(I->getParent()); 6837 } 6838 } else { 6839 Vec = gather(E->Scalars); 6840 } 6841 if (NeedToShuffleReuses) { 6842 ShuffleBuilder.addMask(E->ReuseShuffleIndices); 6843 Vec = ShuffleBuilder.finalize(Vec); 6844 } 6845 E->VectorizedValue = Vec; 6846 return Vec; 6847 } 6848 6849 assert((E->State == TreeEntry::Vectorize || 6850 E->State == TreeEntry::ScatterVectorize) && 6851 "Unhandled state"); 6852 unsigned ShuffleOrOp = 6853 E->isAltShuffle() ? 
(unsigned)Instruction::ShuffleVector : E->getOpcode(); 6854 Instruction *VL0 = E->getMainOp(); 6855 Type *ScalarTy = VL0->getType(); 6856 if (auto *Store = dyn_cast<StoreInst>(VL0)) 6857 ScalarTy = Store->getValueOperand()->getType(); 6858 else if (auto *IE = dyn_cast<InsertElementInst>(VL0)) 6859 ScalarTy = IE->getOperand(1)->getType(); 6860 auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); 6861 switch (ShuffleOrOp) { 6862 case Instruction::PHI: { 6863 assert( 6864 (E->ReorderIndices.empty() || E != VectorizableTree.front().get()) && 6865 "PHI reordering is free."); 6866 auto *PH = cast<PHINode>(VL0); 6867 Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); 6868 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 6869 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); 6870 Value *V = NewPhi; 6871 6872 // Adjust insertion point once all PHI's have been generated. 6873 Builder.SetInsertPoint(&*PH->getParent()->getFirstInsertionPt()); 6874 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 6875 6876 ShuffleBuilder.addInversedMask(E->ReorderIndices); 6877 ShuffleBuilder.addMask(E->ReuseShuffleIndices); 6878 V = ShuffleBuilder.finalize(V); 6879 6880 E->VectorizedValue = V; 6881 6882 // PHINodes may have multiple entries from the same block. We want to 6883 // visit every block once. 6884 SmallPtrSet<BasicBlock*, 4> VisitedBBs; 6885 6886 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { 6887 ValueList Operands; 6888 BasicBlock *IBB = PH->getIncomingBlock(i); 6889 6890 if (!VisitedBBs.insert(IBB).second) { 6891 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB); 6892 continue; 6893 } 6894 6895 Builder.SetInsertPoint(IBB->getTerminator()); 6896 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 6897 Value *Vec = vectorizeTree(E->getOperand(i)); 6898 NewPhi->addIncoming(Vec, IBB); 6899 } 6900 6901 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && 6902 "Invalid number of incoming values"); 6903 return V; 6904 } 6905 6906 case Instruction::ExtractElement: { 6907 Value *V = E->getSingleOperand(0); 6908 Builder.SetInsertPoint(VL0); 6909 ShuffleBuilder.addInversedMask(E->ReorderIndices); 6910 ShuffleBuilder.addMask(E->ReuseShuffleIndices); 6911 V = ShuffleBuilder.finalize(V); 6912 E->VectorizedValue = V; 6913 return V; 6914 } 6915 case Instruction::ExtractValue: { 6916 auto *LI = cast<LoadInst>(E->getSingleOperand(0)); 6917 Builder.SetInsertPoint(LI); 6918 auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); 6919 Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); 6920 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign()); 6921 Value *NewV = propagateMetadata(V, E->Scalars); 6922 ShuffleBuilder.addInversedMask(E->ReorderIndices); 6923 ShuffleBuilder.addMask(E->ReuseShuffleIndices); 6924 NewV = ShuffleBuilder.finalize(NewV); 6925 E->VectorizedValue = NewV; 6926 return NewV; 6927 } 6928 case Instruction::InsertElement: { 6929 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique"); 6930 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back())); 6931 Value *V = vectorizeTree(E->getOperand(1)); 6932 6933 // Create InsertVector shuffle if necessary 6934 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { 6935 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0)); 6936 })); 6937 const unsigned NumElts = 6938 cast<FixedVectorType>(FirstInsert->getType())->getNumElements(); 6939 const unsigned NumScalars = 
E->Scalars.size(); 6940 6941 unsigned Offset = *getInsertIndex(VL0); 6942 assert(Offset < NumElts && "Failed to find vector index offset"); 6943 6944 // Create shuffle to resize vector 6945 SmallVector<int> Mask; 6946 if (!E->ReorderIndices.empty()) { 6947 inversePermutation(E->ReorderIndices, Mask); 6948 Mask.append(NumElts - NumScalars, UndefMaskElem); 6949 } else { 6950 Mask.assign(NumElts, UndefMaskElem); 6951 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); 6952 } 6953 // Create InsertVector shuffle if necessary 6954 bool IsIdentity = true; 6955 SmallVector<int> PrevMask(NumElts, UndefMaskElem); 6956 Mask.swap(PrevMask); 6957 for (unsigned I = 0; I < NumScalars; ++I) { 6958 Value *Scalar = E->Scalars[PrevMask[I]]; 6959 unsigned InsertIdx = *getInsertIndex(Scalar); 6960 IsIdentity &= InsertIdx - Offset == I; 6961 Mask[InsertIdx - Offset] = I; 6962 } 6963 if (!IsIdentity || NumElts != NumScalars) { 6964 V = Builder.CreateShuffleVector(V, Mask); 6965 if (auto *I = dyn_cast<Instruction>(V)) { 6966 GatherShuffleSeq.insert(I); 6967 CSEBlocks.insert(I->getParent()); 6968 } 6969 } 6970 6971 if ((!IsIdentity || Offset != 0 || 6972 !isUndefVector(FirstInsert->getOperand(0))) && 6973 NumElts != NumScalars) { 6974 SmallVector<int> InsertMask(NumElts); 6975 std::iota(InsertMask.begin(), InsertMask.end(), 0); 6976 for (unsigned I = 0; I < NumElts; I++) { 6977 if (Mask[I] != UndefMaskElem) 6978 InsertMask[Offset + I] = NumElts + I; 6979 } 6980 6981 V = Builder.CreateShuffleVector( 6982 FirstInsert->getOperand(0), V, InsertMask, 6983 cast<Instruction>(E->Scalars.back())->getName()); 6984 if (auto *I = dyn_cast<Instruction>(V)) { 6985 GatherShuffleSeq.insert(I); 6986 CSEBlocks.insert(I->getParent()); 6987 } 6988 } 6989 6990 ++NumVectorInstructions; 6991 E->VectorizedValue = V; 6992 return V; 6993 } 6994 case Instruction::ZExt: 6995 case Instruction::SExt: 6996 case Instruction::FPToUI: 6997 case Instruction::FPToSI: 6998 case Instruction::FPExt: 6999 case Instruction::PtrToInt: 7000 case Instruction::IntToPtr: 7001 case Instruction::SIToFP: 7002 case Instruction::UIToFP: 7003 case Instruction::Trunc: 7004 case Instruction::FPTrunc: 7005 case Instruction::BitCast: { 7006 setInsertPointAfterBundle(E); 7007 7008 Value *InVec = vectorizeTree(E->getOperand(0)); 7009 7010 if (E->VectorizedValue) { 7011 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 7012 return E->VectorizedValue; 7013 } 7014 7015 auto *CI = cast<CastInst>(VL0); 7016 Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); 7017 ShuffleBuilder.addInversedMask(E->ReorderIndices); 7018 ShuffleBuilder.addMask(E->ReuseShuffleIndices); 7019 V = ShuffleBuilder.finalize(V); 7020 7021 E->VectorizedValue = V; 7022 ++NumVectorInstructions; 7023 return V; 7024 } 7025 case Instruction::FCmp: 7026 case Instruction::ICmp: { 7027 setInsertPointAfterBundle(E); 7028 7029 Value *L = vectorizeTree(E->getOperand(0)); 7030 Value *R = vectorizeTree(E->getOperand(1)); 7031 7032 if (E->VectorizedValue) { 7033 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 7034 return E->VectorizedValue; 7035 } 7036 7037 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); 7038 Value *V = Builder.CreateCmp(P0, L, R); 7039 propagateIRFlags(V, E->Scalars, VL0); 7040 ShuffleBuilder.addInversedMask(E->ReorderIndices); 7041 ShuffleBuilder.addMask(E->ReuseShuffleIndices); 7042 V = ShuffleBuilder.finalize(V); 7043 7044 E->VectorizedValue = V; 7045 ++NumVectorInstructions; 7046 return V; 7047 } 7048 case Instruction::Select: 
{ 7049 setInsertPointAfterBundle(E); 7050 7051 Value *Cond = vectorizeTree(E->getOperand(0)); 7052 Value *True = vectorizeTree(E->getOperand(1)); 7053 Value *False = vectorizeTree(E->getOperand(2)); 7054 7055 if (E->VectorizedValue) { 7056 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 7057 return E->VectorizedValue; 7058 } 7059 7060 Value *V = Builder.CreateSelect(Cond, True, False); 7061 ShuffleBuilder.addInversedMask(E->ReorderIndices); 7062 ShuffleBuilder.addMask(E->ReuseShuffleIndices); 7063 V = ShuffleBuilder.finalize(V); 7064 7065 E->VectorizedValue = V; 7066 ++NumVectorInstructions; 7067 return V; 7068 } 7069 case Instruction::FNeg: { 7070 setInsertPointAfterBundle(E); 7071 7072 Value *Op = vectorizeTree(E->getOperand(0)); 7073 7074 if (E->VectorizedValue) { 7075 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 7076 return E->VectorizedValue; 7077 } 7078 7079 Value *V = Builder.CreateUnOp( 7080 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op); 7081 propagateIRFlags(V, E->Scalars, VL0); 7082 if (auto *I = dyn_cast<Instruction>(V)) 7083 V = propagateMetadata(I, E->Scalars); 7084 7085 ShuffleBuilder.addInversedMask(E->ReorderIndices); 7086 ShuffleBuilder.addMask(E->ReuseShuffleIndices); 7087 V = ShuffleBuilder.finalize(V); 7088 7089 E->VectorizedValue = V; 7090 ++NumVectorInstructions; 7091 7092 return V; 7093 } 7094 case Instruction::Add: 7095 case Instruction::FAdd: 7096 case Instruction::Sub: 7097 case Instruction::FSub: 7098 case Instruction::Mul: 7099 case Instruction::FMul: 7100 case Instruction::UDiv: 7101 case Instruction::SDiv: 7102 case Instruction::FDiv: 7103 case Instruction::URem: 7104 case Instruction::SRem: 7105 case Instruction::FRem: 7106 case Instruction::Shl: 7107 case Instruction::LShr: 7108 case Instruction::AShr: 7109 case Instruction::And: 7110 case Instruction::Or: 7111 case Instruction::Xor: { 7112 setInsertPointAfterBundle(E); 7113 7114 Value *LHS = vectorizeTree(E->getOperand(0)); 7115 Value *RHS = vectorizeTree(E->getOperand(1)); 7116 7117 if (E->VectorizedValue) { 7118 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 7119 return E->VectorizedValue; 7120 } 7121 7122 Value *V = Builder.CreateBinOp( 7123 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, 7124 RHS); 7125 propagateIRFlags(V, E->Scalars, VL0); 7126 if (auto *I = dyn_cast<Instruction>(V)) 7127 V = propagateMetadata(I, E->Scalars); 7128 7129 ShuffleBuilder.addInversedMask(E->ReorderIndices); 7130 ShuffleBuilder.addMask(E->ReuseShuffleIndices); 7131 V = ShuffleBuilder.finalize(V); 7132 7133 E->VectorizedValue = V; 7134 ++NumVectorInstructions; 7135 7136 return V; 7137 } 7138 case Instruction::Load: { 7139 // Loads are inserted at the head of the tree because we don't want to 7140 // sink them all the way down past store instructions. 7141 setInsertPointAfterBundle(E); 7142 7143 LoadInst *LI = cast<LoadInst>(VL0); 7144 Instruction *NewLI; 7145 unsigned AS = LI->getPointerAddressSpace(); 7146 Value *PO = LI->getPointerOperand(); 7147 if (E->State == TreeEntry::Vectorize) { 7148 Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS)); 7149 NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign()); 7150 7151 // The pointer operand uses an in-tree scalar so we add the new BitCast 7152 // or LoadInst to ExternalUses list to make sure that an extract will 7153 // be generated in the future. 7154 if (TreeEntry *Entry = getTreeEntry(PO)) { 7155 // Find which lane we need to extract. 
7156 unsigned FoundLane = Entry->findLaneForValue(PO); 7157 ExternalUses.emplace_back( 7158 PO, PO != VecPtr ? cast<User>(VecPtr) : NewLI, FoundLane); 7159 } 7160 } else { 7161 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state"); 7162 Value *VecPtr = vectorizeTree(E->getOperand(0)); 7163 // Use the minimum alignment of the gathered loads. 7164 Align CommonAlignment = LI->getAlign(); 7165 for (Value *V : E->Scalars) 7166 CommonAlignment = 7167 commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); 7168 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment); 7169 } 7170 Value *V = propagateMetadata(NewLI, E->Scalars); 7171 7172 ShuffleBuilder.addInversedMask(E->ReorderIndices); 7173 ShuffleBuilder.addMask(E->ReuseShuffleIndices); 7174 V = ShuffleBuilder.finalize(V); 7175 E->VectorizedValue = V; 7176 ++NumVectorInstructions; 7177 return V; 7178 } 7179 case Instruction::Store: { 7180 auto *SI = cast<StoreInst>(VL0); 7181 unsigned AS = SI->getPointerAddressSpace(); 7182 7183 setInsertPointAfterBundle(E); 7184 7185 Value *VecValue = vectorizeTree(E->getOperand(0)); 7186 ShuffleBuilder.addMask(E->ReorderIndices); 7187 VecValue = ShuffleBuilder.finalize(VecValue); 7188 7189 Value *ScalarPtr = SI->getPointerOperand(); 7190 Value *VecPtr = Builder.CreateBitCast( 7191 ScalarPtr, VecValue->getType()->getPointerTo(AS)); 7192 StoreInst *ST = 7193 Builder.CreateAlignedStore(VecValue, VecPtr, SI->getAlign()); 7194 7195 // The pointer operand uses an in-tree scalar, so add the new BitCast or 7196 // StoreInst to ExternalUses to make sure that an extract will be 7197 // generated in the future. 7198 if (TreeEntry *Entry = getTreeEntry(ScalarPtr)) { 7199 // Find which lane we need to extract. 7200 unsigned FoundLane = Entry->findLaneForValue(ScalarPtr); 7201 ExternalUses.push_back(ExternalUser( 7202 ScalarPtr, ScalarPtr != VecPtr ? 
cast<User>(VecPtr) : ST, 7203 FoundLane)); 7204 } 7205 7206 Value *V = propagateMetadata(ST, E->Scalars); 7207 7208 E->VectorizedValue = V; 7209 ++NumVectorInstructions; 7210 return V; 7211 } 7212 case Instruction::GetElementPtr: { 7213 auto *GEP0 = cast<GetElementPtrInst>(VL0); 7214 setInsertPointAfterBundle(E); 7215 7216 Value *Op0 = vectorizeTree(E->getOperand(0)); 7217 7218 SmallVector<Value *> OpVecs; 7219 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) { 7220 Value *OpVec = vectorizeTree(E->getOperand(J)); 7221 OpVecs.push_back(OpVec); 7222 } 7223 7224 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs); 7225 if (Instruction *I = dyn_cast<Instruction>(V)) 7226 V = propagateMetadata(I, E->Scalars); 7227 7228 ShuffleBuilder.addInversedMask(E->ReorderIndices); 7229 ShuffleBuilder.addMask(E->ReuseShuffleIndices); 7230 V = ShuffleBuilder.finalize(V); 7231 7232 E->VectorizedValue = V; 7233 ++NumVectorInstructions; 7234 7235 return V; 7236 } 7237 case Instruction::Call: { 7238 CallInst *CI = cast<CallInst>(VL0); 7239 setInsertPointAfterBundle(E); 7240 7241 Intrinsic::ID IID = Intrinsic::not_intrinsic; 7242 if (Function *FI = CI->getCalledFunction()) 7243 IID = FI->getIntrinsicID(); 7244 7245 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7246 7247 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI); 7248 bool UseIntrinsic = ID != Intrinsic::not_intrinsic && 7249 VecCallCosts.first <= VecCallCosts.second; 7250 7251 Value *ScalarArg = nullptr; 7252 std::vector<Value *> OpVecs; 7253 SmallVector<Type *, 2> TysForDecl = 7254 {FixedVectorType::get(CI->getType(), E->Scalars.size())}; 7255 for (int j = 0, e = CI->arg_size(); j < e; ++j) { 7256 ValueList OpVL; 7257 // Some intrinsics have scalar arguments. This argument should not be 7258 // vectorized. 7259 if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) { 7260 CallInst *CEI = cast<CallInst>(VL0); 7261 ScalarArg = CEI->getArgOperand(j); 7262 OpVecs.push_back(CEI->getArgOperand(j)); 7263 if (hasVectorInstrinsicOverloadedScalarOpd(IID, j)) 7264 TysForDecl.push_back(ScalarArg->getType()); 7265 continue; 7266 } 7267 7268 Value *OpVec = vectorizeTree(E->getOperand(j)); 7269 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); 7270 OpVecs.push_back(OpVec); 7271 } 7272 7273 Function *CF; 7274 if (!UseIntrinsic) { 7275 VFShape Shape = 7276 VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>( 7277 VecTy->getNumElements())), 7278 false /*HasGlobalPred*/); 7279 CF = VFDatabase(*CI).getVectorizedFunction(Shape); 7280 } else { 7281 CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl); 7282 } 7283 7284 SmallVector<OperandBundleDef, 1> OpBundles; 7285 CI->getOperandBundlesAsDefs(OpBundles); 7286 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles); 7287 7288 // The scalar argument uses an in-tree scalar so we add the new vectorized 7289 // call to ExternalUses list to make sure that an extract will be 7290 // generated in the future. 7291 if (ScalarArg) { 7292 if (TreeEntry *Entry = getTreeEntry(ScalarArg)) { 7293 // Find which lane we need to extract. 
7294 unsigned FoundLane = Entry->findLaneForValue(ScalarArg); 7295 ExternalUses.push_back( 7296 ExternalUser(ScalarArg, cast<User>(V), FoundLane)); 7297 } 7298 } 7299 7300 propagateIRFlags(V, E->Scalars, VL0); 7301 ShuffleBuilder.addInversedMask(E->ReorderIndices); 7302 ShuffleBuilder.addMask(E->ReuseShuffleIndices); 7303 V = ShuffleBuilder.finalize(V); 7304 7305 E->VectorizedValue = V; 7306 ++NumVectorInstructions; 7307 return V; 7308 } 7309 case Instruction::ShuffleVector: { 7310 assert(E->isAltShuffle() && 7311 ((Instruction::isBinaryOp(E->getOpcode()) && 7312 Instruction::isBinaryOp(E->getAltOpcode())) || 7313 (Instruction::isCast(E->getOpcode()) && 7314 Instruction::isCast(E->getAltOpcode())) || 7315 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && 7316 "Invalid Shuffle Vector Operand"); 7317 7318 Value *LHS = nullptr, *RHS = nullptr; 7319 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) { 7320 setInsertPointAfterBundle(E); 7321 LHS = vectorizeTree(E->getOperand(0)); 7322 RHS = vectorizeTree(E->getOperand(1)); 7323 } else { 7324 setInsertPointAfterBundle(E); 7325 LHS = vectorizeTree(E->getOperand(0)); 7326 } 7327 7328 if (E->VectorizedValue) { 7329 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 7330 return E->VectorizedValue; 7331 } 7332 7333 Value *V0, *V1; 7334 if (Instruction::isBinaryOp(E->getOpcode())) { 7335 V0 = Builder.CreateBinOp( 7336 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS); 7337 V1 = Builder.CreateBinOp( 7338 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS); 7339 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) { 7340 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS); 7341 auto *AltCI = cast<CmpInst>(E->getAltOp()); 7342 CmpInst::Predicate AltPred = AltCI->getPredicate(); 7343 V1 = Builder.CreateCmp(AltPred, LHS, RHS); 7344 } else { 7345 V0 = Builder.CreateCast( 7346 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy); 7347 V1 = Builder.CreateCast( 7348 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy); 7349 } 7350 // Add V0 and V1 to later analysis to try to find and remove matching 7351 // instruction, if any. 7352 for (Value *V : {V0, V1}) { 7353 if (auto *I = dyn_cast<Instruction>(V)) { 7354 GatherShuffleSeq.insert(I); 7355 CSEBlocks.insert(I->getParent()); 7356 } 7357 } 7358 7359 // Create shuffle to take alternate operations from the vector. 7360 // Also, gather up main and alt scalar ops to propagate IR flags to 7361 // each vector operation. 
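// E.g. (ignoring reordering and reuse masks) for the scalars
// {a0+b0, a1-b1, a2+b2, a3-b3}, V0 is the vector add, V1 the vector sub, and
// the mask becomes <0, 5, 2, 7>: even lanes from V0, odd lanes from V1.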
7362 ValueList OpScalars, AltScalars; 7363 SmallVector<int> Mask; 7364 buildShuffleEntryMask( 7365 E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, 7366 [E](Instruction *I) { 7367 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); 7368 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp()); 7369 }, 7370 Mask, &OpScalars, &AltScalars); 7371 7372 propagateIRFlags(V0, OpScalars); 7373 propagateIRFlags(V1, AltScalars); 7374 7375 Value *V = Builder.CreateShuffleVector(V0, V1, Mask); 7376 if (auto *I = dyn_cast<Instruction>(V)) { 7377 V = propagateMetadata(I, E->Scalars); 7378 GatherShuffleSeq.insert(I); 7379 CSEBlocks.insert(I->getParent()); 7380 } 7381 V = ShuffleBuilder.finalize(V); 7382 7383 E->VectorizedValue = V; 7384 ++NumVectorInstructions; 7385 7386 return V; 7387 } 7388 default: 7389 llvm_unreachable("unknown inst"); 7390 } 7391 return nullptr; 7392 } 7393 7394 Value *BoUpSLP::vectorizeTree() { 7395 ExtraValueToDebugLocsMap ExternallyUsedValues; 7396 return vectorizeTree(ExternallyUsedValues); 7397 } 7398 7399 Value * 7400 BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { 7401 // All blocks must be scheduled before any instructions are inserted. 7402 for (auto &BSIter : BlocksSchedules) { 7403 scheduleBlock(BSIter.second.get()); 7404 } 7405 7406 Builder.SetInsertPoint(&F->getEntryBlock().front()); 7407 auto *VectorRoot = vectorizeTree(VectorizableTree[0].get()); 7408 7409 // If the vectorized tree can be rewritten in a smaller type, we truncate the 7410 // vectorized root. InstCombine will then rewrite the entire expression. We 7411 // sign extend the extracted values below. 7412 auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; 7413 if (MinBWs.count(ScalarRoot)) { 7414 if (auto *I = dyn_cast<Instruction>(VectorRoot)) { 7415 // If current instr is a phi and not the last phi, insert it after the 7416 // last phi node. 7417 if (isa<PHINode>(I)) 7418 Builder.SetInsertPoint(&*I->getParent()->getFirstInsertionPt()); 7419 else 7420 Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); 7421 } 7422 auto BundleWidth = VectorizableTree[0]->Scalars.size(); 7423 auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); 7424 auto *VecTy = FixedVectorType::get(MinTy, BundleWidth); 7425 auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy); 7426 VectorizableTree[0]->VectorizedValue = Trunc; 7427 } 7428 7429 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() 7430 << " values .\n"); 7431 7432 // Extract all of the elements with the external uses. 7433 for (const auto &ExternalUse : ExternalUses) { 7434 Value *Scalar = ExternalUse.Scalar; 7435 llvm::User *User = ExternalUse.User; 7436 7437 // Skip users that we already RAUW. This happens when one instruction 7438 // has multiple uses of the same value. 7439 if (User && !is_contained(Scalar->users(), User)) 7440 continue; 7441 TreeEntry *E = getTreeEntry(Scalar); 7442 assert(E && "Invalid scalar"); 7443 assert(E->State != TreeEntry::NeedToGather && 7444 "Extracting from a gather list"); 7445 7446 Value *Vec = E->VectorizedValue; 7447 assert(Vec && "Can't find vectorizable value"); 7448 7449 Value *Lane = Builder.getInt32(ExternalUse.Lane); 7450 auto ExtractAndExtendIfNeeded = [&](Value *Vec) { 7451 if (Scalar->getType() != Vec->getType()) { 7452 Value *Ex; 7453 // "Reuse" the existing extract to improve final codegen. 
7454 if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) { 7455 Ex = Builder.CreateExtractElement(ES->getOperand(0), 7456 ES->getOperand(1)); 7457 } else { 7458 Ex = Builder.CreateExtractElement(Vec, Lane); 7459 } 7460 // If necessary, sign-extend or zero-extend ScalarRoot 7461 // to the larger type. 7462 if (!MinBWs.count(ScalarRoot)) 7463 return Ex; 7464 if (MinBWs[ScalarRoot].second) 7465 return Builder.CreateSExt(Ex, Scalar->getType()); 7466 return Builder.CreateZExt(Ex, Scalar->getType()); 7467 } 7468 assert(isa<FixedVectorType>(Scalar->getType()) && 7469 isa<InsertElementInst>(Scalar) && 7470 "In-tree scalar of vector type is not insertelement?"); 7471 return Vec; 7472 }; 7473 // If User == nullptr, the Scalar is used as extra arg. Generate 7474 // ExtractElement instruction and update the record for this scalar in 7475 // ExternallyUsedValues. 7476 if (!User) { 7477 assert(ExternallyUsedValues.count(Scalar) && 7478 "Scalar with nullptr as an external user must be registered in " 7479 "ExternallyUsedValues map"); 7480 if (auto *VecI = dyn_cast<Instruction>(Vec)) { 7481 Builder.SetInsertPoint(VecI->getParent(), 7482 std::next(VecI->getIterator())); 7483 } else { 7484 Builder.SetInsertPoint(&F->getEntryBlock().front()); 7485 } 7486 Value *NewInst = ExtractAndExtendIfNeeded(Vec); 7487 CSEBlocks.insert(cast<Instruction>(Scalar)->getParent()); 7488 auto &NewInstLocs = ExternallyUsedValues[NewInst]; 7489 auto It = ExternallyUsedValues.find(Scalar); 7490 assert(It != ExternallyUsedValues.end() && 7491 "Externally used scalar is not found in ExternallyUsedValues"); 7492 NewInstLocs.append(It->second); 7493 ExternallyUsedValues.erase(Scalar); 7494 // Required to update internally referenced instructions. 7495 Scalar->replaceAllUsesWith(NewInst); 7496 continue; 7497 } 7498 7499 // Generate extracts for out-of-tree users. 7500 // Find the insertion point for the extractelement lane. 7501 if (auto *VecI = dyn_cast<Instruction>(Vec)) { 7502 if (PHINode *PH = dyn_cast<PHINode>(User)) { 7503 for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) { 7504 if (PH->getIncomingValue(i) == Scalar) { 7505 Instruction *IncomingTerminator = 7506 PH->getIncomingBlock(i)->getTerminator(); 7507 if (isa<CatchSwitchInst>(IncomingTerminator)) { 7508 Builder.SetInsertPoint(VecI->getParent(), 7509 std::next(VecI->getIterator())); 7510 } else { 7511 Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); 7512 } 7513 Value *NewInst = ExtractAndExtendIfNeeded(Vec); 7514 CSEBlocks.insert(PH->getIncomingBlock(i)); 7515 PH->setOperand(i, NewInst); 7516 } 7517 } 7518 } else { 7519 Builder.SetInsertPoint(cast<Instruction>(User)); 7520 Value *NewInst = ExtractAndExtendIfNeeded(Vec); 7521 CSEBlocks.insert(cast<Instruction>(User)->getParent()); 7522 User->replaceUsesOfWith(Scalar, NewInst); 7523 } 7524 } else { 7525 Builder.SetInsertPoint(&F->getEntryBlock().front()); 7526 Value *NewInst = ExtractAndExtendIfNeeded(Vec); 7527 CSEBlocks.insert(&F->getEntryBlock()); 7528 User->replaceUsesOfWith(Scalar, NewInst); 7529 } 7530 7531 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); 7532 } 7533 7534 // For each vectorized value: 7535 for (auto &TEPtr : VectorizableTree) { 7536 TreeEntry *Entry = TEPtr.get(); 7537 7538 // No need to handle users of gathered values. 
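// Gathered scalars are not replaced by vector code, so they keep their original
// uses and must not be erased below.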
7539 if (Entry->State == TreeEntry::NeedToGather)
7540 continue;
7541
7542 assert(Entry->VectorizedValue && "Can't find vectorizable value");
7543
7544 // For each lane:
7545 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
7546 Value *Scalar = Entry->Scalars[Lane];
7547
7548 #ifndef NDEBUG
7549 Type *Ty = Scalar->getType();
7550 if (!Ty->isVoidTy()) {
7551 for (User *U : Scalar->users()) {
7552 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
7553
7554 // It is legal to delete users in the ignorelist.
7555 assert((getTreeEntry(U) || is_contained(UserIgnoreList, U) ||
7556 (isa_and_nonnull<Instruction>(U) &&
7557 isDeleted(cast<Instruction>(U)))) &&
7558 "Deleting out-of-tree value");
7559 }
7560 }
7561 #endif
7562 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
7563 eraseInstruction(cast<Instruction>(Scalar));
7564 }
7565 }
7566
7567 Builder.ClearInsertionPoint();
7568 InstrElementSize.clear();
7569
7570 return VectorizableTree[0]->VectorizedValue;
7571 }
7572
7573 void BoUpSLP::optimizeGatherSequence() {
7574 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleSeq.size()
7575 << " gather sequence instructions.\n");
7576 // LICM InsertElementInst sequences.
7577 for (Instruction *I : GatherShuffleSeq) {
7578 if (isDeleted(I))
7579 continue;
7580
7581 // Check if this block is inside a loop.
7582 Loop *L = LI->getLoopFor(I->getParent());
7583 if (!L)
7584 continue;
7585
7586 // Check if it has a preheader.
7587 BasicBlock *PreHeader = L->getLoopPreheader();
7588 if (!PreHeader)
7589 continue;
7590
7591 // If the vector or the element that we insert into it is an instruction
7592 // defined inside the loop, then we can't hoist this instruction out of
7593 // the loop.
7594 if (any_of(I->operands(), [L](Value *V) {
7595 auto *OpI = dyn_cast<Instruction>(V);
7596 return OpI && L->contains(OpI);
7597 }))
7598 continue;
7599
7600 // We can hoist this instruction. Move it to the pre-header.
7601 I->moveBefore(PreHeader->getTerminator());
7602 }
7603
7604 // Make a list of all reachable blocks in our CSE queue.
7605 SmallVector<const DomTreeNode *, 8> CSEWorkList;
7606 CSEWorkList.reserve(CSEBlocks.size());
7607 for (BasicBlock *BB : CSEBlocks)
7608 if (DomTreeNode *N = DT->getNode(BB)) {
7609 assert(DT->isReachableFromEntry(N));
7610 CSEWorkList.push_back(N);
7611 }
7612
7613 // Sort blocks by domination. This ensures we visit a block after all blocks
7614 // dominating it are visited.
7615 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
7616 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
7617 "Different nodes should have different DFS numbers");
7618 return A->getDFSNumIn() < B->getDFSNumIn();
7619 });
7620
7621 // Less defined shuffles can be replaced by the more defined copies.
7622 // Between two shuffles, one is less defined if it has the same vector operands
7623 // and its mask indices are either the same as in the other one or undef. E.g.
7624 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
7625 // poison, <0, 0, 0, 0>.
7626 auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2, 7627 SmallVectorImpl<int> &NewMask) { 7628 if (I1->getType() != I2->getType()) 7629 return false; 7630 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1); 7631 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2); 7632 if (!SI1 || !SI2) 7633 return I1->isIdenticalTo(I2); 7634 if (SI1->isIdenticalTo(SI2)) 7635 return true; 7636 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I) 7637 if (SI1->getOperand(I) != SI2->getOperand(I)) 7638 return false; 7639 // Check if the second instruction is more defined than the first one. 7640 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end()); 7641 ArrayRef<int> SM1 = SI1->getShuffleMask(); 7642 // Count trailing undefs in the mask to check the final number of used 7643 // registers. 7644 unsigned LastUndefsCnt = 0; 7645 for (int I = 0, E = NewMask.size(); I < E; ++I) { 7646 if (SM1[I] == UndefMaskElem) 7647 ++LastUndefsCnt; 7648 else 7649 LastUndefsCnt = 0; 7650 if (NewMask[I] != UndefMaskElem && SM1[I] != UndefMaskElem && 7651 NewMask[I] != SM1[I]) 7652 return false; 7653 if (NewMask[I] == UndefMaskElem) 7654 NewMask[I] = SM1[I]; 7655 } 7656 // Check if the last undefs actually change the final number of used vector 7657 // registers. 7658 return SM1.size() - LastUndefsCnt > 1 && 7659 TTI->getNumberOfParts(SI1->getType()) == 7660 TTI->getNumberOfParts( 7661 FixedVectorType::get(SI1->getType()->getElementType(), 7662 SM1.size() - LastUndefsCnt)); 7663 }; 7664 // Perform O(N^2) search over the gather/shuffle sequences and merge identical 7665 // instructions. TODO: We can further optimize this scan if we split the 7666 // instructions into different buckets based on the insert lane. 7667 SmallVector<Instruction *, 16> Visited; 7668 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { 7669 assert(*I && 7670 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && 7671 "Worklist not sorted properly!"); 7672 BasicBlock *BB = (*I)->getBlock(); 7673 // For all instructions in blocks containing gather sequences: 7674 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 7675 if (isDeleted(&In)) 7676 continue; 7677 if (!isa<InsertElementInst>(&In) && !isa<ExtractElementInst>(&In) && 7678 !isa<ShuffleVectorInst>(&In) && !GatherShuffleSeq.contains(&In)) 7679 continue; 7680 7681 // Check if we can replace this instruction with any of the 7682 // visited instructions. 
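// Note: the compatibility check below is tried in both directions, so either
// the previously visited copy or the current instruction may survive; when
// both are shuffles, the survivor's mask is refined with the defined lanes of
// the instruction it replaces.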
7683 bool Replaced = false; 7684 for (Instruction *&V : Visited) { 7685 SmallVector<int> NewMask; 7686 if (IsIdenticalOrLessDefined(&In, V, NewMask) && 7687 DT->dominates(V->getParent(), In.getParent())) { 7688 In.replaceAllUsesWith(V); 7689 eraseInstruction(&In); 7690 if (auto *SI = dyn_cast<ShuffleVectorInst>(V)) 7691 if (!NewMask.empty()) 7692 SI->setShuffleMask(NewMask); 7693 Replaced = true; 7694 break; 7695 } 7696 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) && 7697 GatherShuffleSeq.contains(V) && 7698 IsIdenticalOrLessDefined(V, &In, NewMask) && 7699 DT->dominates(In.getParent(), V->getParent())) { 7700 In.moveAfter(V); 7701 V->replaceAllUsesWith(&In); 7702 eraseInstruction(V); 7703 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In)) 7704 if (!NewMask.empty()) 7705 SI->setShuffleMask(NewMask); 7706 V = &In; 7707 Replaced = true; 7708 break; 7709 } 7710 } 7711 if (!Replaced) { 7712 assert(!is_contained(Visited, &In)); 7713 Visited.push_back(&In); 7714 } 7715 } 7716 } 7717 CSEBlocks.clear(); 7718 GatherShuffleSeq.clear(); 7719 } 7720 7721 BoUpSLP::ScheduleData * 7722 BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) { 7723 ScheduleData *Bundle = nullptr; 7724 ScheduleData *PrevInBundle = nullptr; 7725 for (Value *V : VL) { 7726 if (doesNotNeedToBeScheduled(V)) 7727 continue; 7728 ScheduleData *BundleMember = getScheduleData(V); 7729 assert(BundleMember && 7730 "no ScheduleData for bundle member " 7731 "(maybe not in same basic block)"); 7732 assert(BundleMember->isSchedulingEntity() && 7733 "bundle member already part of other bundle"); 7734 if (PrevInBundle) { 7735 PrevInBundle->NextInBundle = BundleMember; 7736 } else { 7737 Bundle = BundleMember; 7738 } 7739 7740 // Group the instructions to a bundle. 7741 BundleMember->FirstInBundle = Bundle; 7742 PrevInBundle = BundleMember; 7743 } 7744 assert(Bundle && "Failed to find schedule bundle"); 7745 return Bundle; 7746 } 7747 7748 // Groups the instructions to a bundle (which is then a single scheduling entity) 7749 // and schedules instructions until the bundle gets ready. 7750 Optional<BoUpSLP::ScheduleData *> 7751 BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, 7752 const InstructionsState &S) { 7753 // No need to schedule PHIs, insertelement, extractelement and extractvalue 7754 // instructions. 7755 if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) || 7756 doesNotNeedToSchedule(VL)) 7757 return nullptr; 7758 7759 // Initialize the instruction bundle. 7760 Instruction *OldScheduleEnd = ScheduleEnd; 7761 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n"); 7762 7763 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule, 7764 ScheduleData *Bundle) { 7765 // The scheduling region got new instructions at the lower end (or it is a 7766 // new region for the first bundle). This makes it necessary to 7767 // recalculate all dependencies. 7768 // It is seldom that this needs to be done a second time after adding the 7769 // initial bundle to the region. 
7770 if (ScheduleEnd != OldScheduleEnd) {
7771 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
7772 doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
7773 ReSchedule = true;
7774 }
7775 if (Bundle) {
7776 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
7777 << " in block " << BB->getName() << "\n");
7778 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
7779 }
7780 
7781 if (ReSchedule) {
7782 resetSchedule();
7783 initialFillReadyList(ReadyInsts);
7784 }
7785 
7786 // Now try to schedule the new bundle or (if no bundle) just calculate
7787 // dependencies. As soon as the bundle is "ready" it means that there are no
7788 // cyclic dependencies and we can schedule it. Note that it's important that we
7789 // don't "schedule" the bundle yet (see cancelScheduling).
7790 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
7791 !ReadyInsts.empty()) {
7792 ScheduleData *Picked = ReadyInsts.pop_back_val();
7793 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
7794 "must be ready to schedule");
7795 schedule(Picked, ReadyInsts);
7796 }
7797 };
7798 
7799 // Make sure that the scheduling region contains all
7800 // instructions of the bundle.
7801 for (Value *V : VL) {
7802 if (doesNotNeedToBeScheduled(V))
7803 continue;
7804 if (!extendSchedulingRegion(V, S)) {
7805 // If the scheduling region got new instructions at the lower end (or it
7806 // is a new region for the first bundle), all dependencies must still be
7807 // recalculated before giving up on this bundle.
7808 // Otherwise the compiler may crash trying to incorrectly calculate
7809 // dependencies and emit instructions in the wrong order at the actual
7810 // scheduling.
7811 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
7812 return None;
7813 }
7814 }
7815 
7816 bool ReSchedule = false;
7817 for (Value *V : VL) {
7818 if (doesNotNeedToBeScheduled(V))
7819 continue;
7820 ScheduleData *BundleMember = getScheduleData(V);
7821 assert(BundleMember &&
7822 "no ScheduleData for bundle member (maybe not in same basic block)");
7823 
7824 // Make sure we don't leave the pieces of the bundle in the ready list when
7825 // the whole bundle might not be ready.
7826 ReadyInsts.remove(BundleMember);
7827 
7828 if (!BundleMember->IsScheduled)
7829 continue;
7830 // A bundle member was scheduled as a single instruction before and now
7831 // needs to be scheduled as part of the bundle. We just get rid of the
7832 // existing schedule.
7833 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember 7834 << " was already scheduled\n"); 7835 ReSchedule = true; 7836 } 7837 7838 auto *Bundle = buildBundle(VL); 7839 TryScheduleBundleImpl(ReSchedule, Bundle); 7840 if (!Bundle->isReady()) { 7841 cancelScheduling(VL, S.OpValue); 7842 return None; 7843 } 7844 return Bundle; 7845 } 7846 7847 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL, 7848 Value *OpValue) { 7849 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) || 7850 doesNotNeedToSchedule(VL)) 7851 return; 7852 7853 if (doesNotNeedToBeScheduled(OpValue)) 7854 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled); 7855 ScheduleData *Bundle = getScheduleData(OpValue); 7856 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); 7857 assert(!Bundle->IsScheduled && 7858 "Can't cancel bundle which is already scheduled"); 7859 assert(Bundle->isSchedulingEntity() && 7860 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) && 7861 "tried to unbundle something which is not a bundle"); 7862 7863 // Remove the bundle from the ready list. 7864 if (Bundle->isReady()) 7865 ReadyInsts.remove(Bundle); 7866 7867 // Un-bundle: make single instructions out of the bundle. 7868 ScheduleData *BundleMember = Bundle; 7869 while (BundleMember) { 7870 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links"); 7871 BundleMember->FirstInBundle = BundleMember; 7872 ScheduleData *Next = BundleMember->NextInBundle; 7873 BundleMember->NextInBundle = nullptr; 7874 BundleMember->TE = nullptr; 7875 if (BundleMember->unscheduledDepsInBundle() == 0) { 7876 ReadyInsts.insert(BundleMember); 7877 } 7878 BundleMember = Next; 7879 } 7880 } 7881 7882 BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { 7883 // Allocate a new ScheduleData for the instruction. 7884 if (ChunkPos >= ChunkSize) { 7885 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize)); 7886 ChunkPos = 0; 7887 } 7888 return &(ScheduleDataChunks.back()[ChunkPos++]); 7889 } 7890 7891 bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, 7892 const InstructionsState &S) { 7893 if (getScheduleData(V, isOneOf(S, V))) 7894 return true; 7895 Instruction *I = dyn_cast<Instruction>(V); 7896 assert(I && "bundle member must be an instruction"); 7897 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) && 7898 !doesNotNeedToBeScheduled(I) && 7899 "phi nodes/insertelements/extractelements/extractvalues don't need to " 7900 "be scheduled"); 7901 auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool { 7902 ScheduleData *ISD = getScheduleData(I); 7903 if (!ISD) 7904 return false; 7905 assert(isInSchedulingRegion(ISD) && 7906 "ScheduleData not in scheduling region"); 7907 ScheduleData *SD = allocateScheduleDataChunks(); 7908 SD->Inst = I; 7909 SD->init(SchedulingRegionID, S.OpValue); 7910 ExtraScheduleDataMap[I][S.OpValue] = SD; 7911 return true; 7912 }; 7913 if (CheckScheduleForI(I)) 7914 return true; 7915 if (!ScheduleStart) { 7916 // It's the first instruction in the new region. 
7917 initScheduleData(I, I->getNextNode(), nullptr, nullptr); 7918 ScheduleStart = I; 7919 ScheduleEnd = I->getNextNode(); 7920 if (isOneOf(S, I) != I) 7921 CheckScheduleForI(I); 7922 assert(ScheduleEnd && "tried to vectorize a terminator?"); 7923 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); 7924 return true; 7925 } 7926 // Search up and down at the same time, because we don't know if the new 7927 // instruction is above or below the existing scheduling region. 7928 BasicBlock::reverse_iterator UpIter = 7929 ++ScheduleStart->getIterator().getReverse(); 7930 BasicBlock::reverse_iterator UpperEnd = BB->rend(); 7931 BasicBlock::iterator DownIter = ScheduleEnd->getIterator(); 7932 BasicBlock::iterator LowerEnd = BB->end(); 7933 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I && 7934 &*DownIter != I) { 7935 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) { 7936 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n"); 7937 return false; 7938 } 7939 7940 ++UpIter; 7941 ++DownIter; 7942 } 7943 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) { 7944 assert(I->getParent() == ScheduleStart->getParent() && 7945 "Instruction is in wrong basic block."); 7946 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); 7947 ScheduleStart = I; 7948 if (isOneOf(S, I) != I) 7949 CheckScheduleForI(I); 7950 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I 7951 << "\n"); 7952 return true; 7953 } 7954 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) && 7955 "Expected to reach top of the basic block or instruction down the " 7956 "lower end."); 7957 assert(I->getParent() == ScheduleEnd->getParent() && 7958 "Instruction is in wrong basic block."); 7959 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, 7960 nullptr); 7961 ScheduleEnd = I->getNextNode(); 7962 if (isOneOf(S, I) != I) 7963 CheckScheduleForI(I); 7964 assert(ScheduleEnd && "tried to vectorize a terminator?"); 7965 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); 7966 return true; 7967 } 7968 7969 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, 7970 Instruction *ToI, 7971 ScheduleData *PrevLoadStore, 7972 ScheduleData *NextLoadStore) { 7973 ScheduleData *CurrentLoadStore = PrevLoadStore; 7974 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { 7975 // No need to allocate data for non-schedulable instructions. 7976 if (doesNotNeedToBeScheduled(I)) 7977 continue; 7978 ScheduleData *SD = ScheduleDataMap.lookup(I); 7979 if (!SD) { 7980 SD = allocateScheduleDataChunks(); 7981 ScheduleDataMap[I] = SD; 7982 SD->Inst = I; 7983 } 7984 assert(!isInSchedulingRegion(SD) && 7985 "new ScheduleData already in scheduling region"); 7986 SD->init(SchedulingRegionID, I); 7987 7988 if (I->mayReadOrWriteMemory() && 7989 (!isa<IntrinsicInst>(I) || 7990 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect && 7991 cast<IntrinsicInst>(I)->getIntrinsicID() != 7992 Intrinsic::pseudoprobe))) { 7993 // Update the linked list of memory accessing instructions. 
7994 if (CurrentLoadStore) {
7995 CurrentLoadStore->NextLoadStore = SD;
7996 } else {
7997 FirstLoadStoreInRegion = SD;
7998 }
7999 CurrentLoadStore = SD;
8000 }
8001 }
8002 if (NextLoadStore) {
8003 if (CurrentLoadStore)
8004 CurrentLoadStore->NextLoadStore = NextLoadStore;
8005 } else {
8006 LastLoadStoreInRegion = CurrentLoadStore;
8007 }
8008 }
8009 
8010 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
8011 bool InsertInReadyList,
8012 BoUpSLP *SLP) {
8013 assert(SD->isSchedulingEntity());
8014 
8015 SmallVector<ScheduleData *, 10> WorkList;
8016 WorkList.push_back(SD);
8017 
8018 while (!WorkList.empty()) {
8019 ScheduleData *SD = WorkList.pop_back_val();
8020 for (ScheduleData *BundleMember = SD; BundleMember;
8021 BundleMember = BundleMember->NextInBundle) {
8022 assert(isInSchedulingRegion(BundleMember));
8023 if (BundleMember->hasValidDependencies())
8024 continue;
8025 
8026 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
8027 << "\n");
8028 BundleMember->Dependencies = 0;
8029 BundleMember->resetUnscheduledDeps();
8030 
8031 // Handle def-use chain dependencies.
8032 if (BundleMember->OpValue != BundleMember->Inst) {
8033 if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
8034 BundleMember->Dependencies++;
8035 ScheduleData *DestBundle = UseSD->FirstInBundle;
8036 if (!DestBundle->IsScheduled)
8037 BundleMember->incrementUnscheduledDeps(1);
8038 if (!DestBundle->hasValidDependencies())
8039 WorkList.push_back(DestBundle);
8040 }
8041 } else {
8042 for (User *U : BundleMember->Inst->users()) {
8043 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
8044 BundleMember->Dependencies++;
8045 ScheduleData *DestBundle = UseSD->FirstInBundle;
8046 if (!DestBundle->IsScheduled)
8047 BundleMember->incrementUnscheduledDeps(1);
8048 if (!DestBundle->hasValidDependencies())
8049 WorkList.push_back(DestBundle);
8050 }
8051 }
8052 }
8053 
8054 // Any instruction which isn't safe to speculate at the beginning of the
8055 // block is control dependent on any early exit or non-willreturn call
8056 // which precedes it.
8057 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
8058 for (Instruction *I = BundleMember->Inst->getNextNode();
8059 I != ScheduleEnd; I = I->getNextNode()) {
8060 if (isSafeToSpeculativelyExecute(I, &*BB->begin()))
8061 continue;
8062 
8063 // Add the dependency
8064 auto *DepDest = getScheduleData(I);
8065 assert(DepDest && "must be in schedule window");
8066 DepDest->ControlDependencies.push_back(BundleMember);
8067 BundleMember->Dependencies++;
8068 ScheduleData *DestBundle = DepDest->FirstInBundle;
8069 if (!DestBundle->IsScheduled)
8070 BundleMember->incrementUnscheduledDeps(1);
8071 if (!DestBundle->hasValidDependencies())
8072 WorkList.push_back(DestBundle);
8073 
8074 if (!isGuaranteedToTransferExecutionToSuccessor(I))
8075 // Everything past here must be control dependent on I.
8076 break;
8077 }
8078 }
8079 
8080 // If we have an inalloca alloca instruction, it needs to be scheduled
8081 // after any preceding stacksave.
8082 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>())) {
8083 for (Instruction *I = BundleMember->Inst->getNextNode();
8084 I != ScheduleEnd; I = I->getNextNode()) {
8085 if (match(I, m_Intrinsic<Intrinsic::stacksave>()))
8086 // Any allocas past here must be control dependent on I, and I
8087 // must be memory dependent on BundleMember->Inst.
8088 break; 8089 8090 if (!isa<AllocaInst>(I)) 8091 continue; 8092 8093 // Add the dependency 8094 auto *DepDest = getScheduleData(I); 8095 assert(DepDest && "must be in schedule window"); 8096 DepDest->ControlDependencies.push_back(BundleMember); 8097 BundleMember->Dependencies++; 8098 ScheduleData *DestBundle = DepDest->FirstInBundle; 8099 if (!DestBundle->IsScheduled) 8100 BundleMember->incrementUnscheduledDeps(1); 8101 if (!DestBundle->hasValidDependencies()) 8102 WorkList.push_back(DestBundle); 8103 } 8104 } 8105 8106 8107 // Handle the memory dependencies (if any). 8108 ScheduleData *DepDest = BundleMember->NextLoadStore; 8109 if (!DepDest) 8110 continue; 8111 Instruction *SrcInst = BundleMember->Inst; 8112 assert(SrcInst->mayReadOrWriteMemory() && 8113 "NextLoadStore list for non memory effecting bundle?"); 8114 MemoryLocation SrcLoc = getLocation(SrcInst); 8115 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); 8116 unsigned numAliased = 0; 8117 unsigned DistToSrc = 1; 8118 8119 for ( ; DepDest; DepDest = DepDest->NextLoadStore) { 8120 assert(isInSchedulingRegion(DepDest)); 8121 8122 // We have two limits to reduce the complexity: 8123 // 1) AliasedCheckLimit: It's a small limit to reduce calls to 8124 // SLP->isAliased (which is the expensive part in this loop). 8125 // 2) MaxMemDepDistance: It's for very large blocks and it aborts 8126 // the whole loop (even if the loop is fast, it's quadratic). 8127 // It's important for the loop break condition (see below) to 8128 // check this limit even between two read-only instructions. 8129 if (DistToSrc >= MaxMemDepDistance || 8130 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) && 8131 (numAliased >= AliasedCheckLimit || 8132 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) { 8133 8134 // We increment the counter only if the locations are aliased 8135 // (instead of counting all alias checks). This gives a better 8136 // balance between reduced runtime and accurate dependencies. 8137 numAliased++; 8138 8139 DepDest->MemoryDependencies.push_back(BundleMember); 8140 BundleMember->Dependencies++; 8141 ScheduleData *DestBundle = DepDest->FirstInBundle; 8142 if (!DestBundle->IsScheduled) { 8143 BundleMember->incrementUnscheduledDeps(1); 8144 } 8145 if (!DestBundle->hasValidDependencies()) { 8146 WorkList.push_back(DestBundle); 8147 } 8148 } 8149 8150 // Example, explaining the loop break condition: Let's assume our 8151 // starting instruction is i0 and MaxMemDepDistance = 3. 8152 // 8153 // +--------v--v--v 8154 // i0,i1,i2,i3,i4,i5,i6,i7,i8 8155 // +--------^--^--^ 8156 // 8157 // MaxMemDepDistance let us stop alias-checking at i3 and we add 8158 // dependencies from i0 to i3,i4,.. (even if they are not aliased). 8159 // Previously we already added dependencies from i3 to i6,i7,i8 8160 // (because of MaxMemDepDistance). As we added a dependency from 8161 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 8162 // and we can abort this loop at i6. 
8163 if (DistToSrc >= 2 * MaxMemDepDistance) 8164 break; 8165 DistToSrc++; 8166 } 8167 } 8168 if (InsertInReadyList && SD->isReady()) { 8169 ReadyInsts.insert(SD); 8170 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst 8171 << "\n"); 8172 } 8173 } 8174 } 8175 8176 void BoUpSLP::BlockScheduling::resetSchedule() { 8177 assert(ScheduleStart && 8178 "tried to reset schedule on block which has not been scheduled"); 8179 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 8180 doForAllOpcodes(I, [&](ScheduleData *SD) { 8181 assert(isInSchedulingRegion(SD) && 8182 "ScheduleData not in scheduling region"); 8183 SD->IsScheduled = false; 8184 SD->resetUnscheduledDeps(); 8185 }); 8186 } 8187 ReadyInsts.clear(); 8188 } 8189 8190 void BoUpSLP::scheduleBlock(BlockScheduling *BS) { 8191 if (!BS->ScheduleStart) 8192 return; 8193 8194 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n"); 8195 8196 BS->resetSchedule(); 8197 8198 // For the real scheduling we use a more sophisticated ready-list: it is 8199 // sorted by the original instruction location. This lets the final schedule 8200 // be as close as possible to the original instruction order. 8201 // WARNING: If changing this order causes a correctness issue, that means 8202 // there is some missing dependence edge in the schedule data graph. 8203 struct ScheduleDataCompare { 8204 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const { 8205 return SD2->SchedulingPriority < SD1->SchedulingPriority; 8206 } 8207 }; 8208 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts; 8209 8210 // Ensure that all dependency data is updated and fill the ready-list with 8211 // initial instructions. 8212 int Idx = 0; 8213 int NumToSchedule = 0; 8214 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; 8215 I = I->getNextNode()) { 8216 BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { 8217 TreeEntry *SDTE = getTreeEntry(SD->Inst); 8218 (void)SDTE; 8219 assert((isVectorLikeInstWithConstOps(SD->Inst) || 8220 SD->isPartOfBundle() == 8221 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) && 8222 "scheduler and vectorizer bundle mismatch"); 8223 SD->FirstInBundle->SchedulingPriority = Idx++; 8224 if (SD->isSchedulingEntity()) { 8225 BS->calculateDependencies(SD, false, this); 8226 NumToSchedule++; 8227 } 8228 }); 8229 } 8230 BS->initialFillReadyList(ReadyInsts); 8231 8232 Instruction *LastScheduledInst = BS->ScheduleEnd; 8233 8234 // Do the "real" scheduling. 8235 while (!ReadyInsts.empty()) { 8236 ScheduleData *picked = *ReadyInsts.begin(); 8237 ReadyInsts.erase(ReadyInsts.begin()); 8238 8239 // Move the scheduled instruction(s) to their dedicated places, if not 8240 // there yet. 8241 for (ScheduleData *BundleMember = picked; BundleMember; 8242 BundleMember = BundleMember->NextInBundle) { 8243 Instruction *pickedInst = BundleMember->Inst; 8244 if (pickedInst->getNextNode() != LastScheduledInst) 8245 pickedInst->moveBefore(LastScheduledInst); 8246 LastScheduledInst = pickedInst; 8247 } 8248 8249 BS->schedule(picked, ReadyInsts); 8250 NumToSchedule--; 8251 } 8252 assert(NumToSchedule == 0 && "could not schedule all instructions"); 8253 8254 // Check that we didn't break any of our invariants. 
8255 #ifdef EXPENSIVE_CHECKS 8256 BS->verify(); 8257 #endif 8258 8259 #if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS) 8260 // Check that all schedulable entities got scheduled 8261 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { 8262 BS->doForAllOpcodes(I, [&](ScheduleData *SD) { 8263 if (SD->isSchedulingEntity() && SD->hasValidDependencies()) { 8264 assert(SD->IsScheduled && "must be scheduled at this point"); 8265 } 8266 }); 8267 } 8268 #endif 8269 8270 // Avoid duplicate scheduling of the block. 8271 BS->ScheduleStart = nullptr; 8272 } 8273 8274 unsigned BoUpSLP::getVectorElementSize(Value *V) { 8275 // If V is a store, just return the width of the stored value (or value 8276 // truncated just before storing) without traversing the expression tree. 8277 // This is the common case. 8278 if (auto *Store = dyn_cast<StoreInst>(V)) { 8279 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand())) 8280 return DL->getTypeSizeInBits(Trunc->getSrcTy()); 8281 return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); 8282 } 8283 8284 if (auto *IEI = dyn_cast<InsertElementInst>(V)) 8285 return getVectorElementSize(IEI->getOperand(1)); 8286 8287 auto E = InstrElementSize.find(V); 8288 if (E != InstrElementSize.end()) 8289 return E->second; 8290 8291 // If V is not a store, we can traverse the expression tree to find loads 8292 // that feed it. The type of the loaded value may indicate a more suitable 8293 // width than V's type. We want to base the vector element size on the width 8294 // of memory operations where possible. 8295 SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist; 8296 SmallPtrSet<Instruction *, 16> Visited; 8297 if (auto *I = dyn_cast<Instruction>(V)) { 8298 Worklist.emplace_back(I, I->getParent()); 8299 Visited.insert(I); 8300 } 8301 8302 // Traverse the expression tree in bottom-up order looking for loads. If we 8303 // encounter an instruction we don't yet handle, we give up. 8304 auto Width = 0u; 8305 while (!Worklist.empty()) { 8306 Instruction *I; 8307 BasicBlock *Parent; 8308 std::tie(I, Parent) = Worklist.pop_back_val(); 8309 8310 // We should only be looking at scalar instructions here. If the current 8311 // instruction has a vector type, skip. 8312 auto *Ty = I->getType(); 8313 if (isa<VectorType>(Ty)) 8314 continue; 8315 8316 // If the current instruction is a load, update MaxWidth to reflect the 8317 // width of the loaded value. 8318 if (isa<LoadInst>(I) || isa<ExtractElementInst>(I) || 8319 isa<ExtractValueInst>(I)) 8320 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty)); 8321 8322 // Otherwise, we need to visit the operands of the instruction. We only 8323 // handle the interesting cases from buildTree here. If an operand is an 8324 // instruction we haven't yet visited and from the same basic block as the 8325 // user or the use is a PHI node, we add it to the worklist. 8326 else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) || 8327 isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I) || 8328 isa<UnaryOperator>(I)) { 8329 for (Use &U : I->operands()) 8330 if (auto *J = dyn_cast<Instruction>(U.get())) 8331 if (Visited.insert(J).second && 8332 (isa<PHINode>(I) || J->getParent() == Parent)) 8333 Worklist.emplace_back(J, J->getParent()); 8334 } else { 8335 break; 8336 } 8337 } 8338 8339 // If we didn't encounter a memory access in the expression tree, or if we 8340 // gave up for some reason, just return the width of V. 
Otherwise, return the 8341 // maximum width we found. 8342 if (!Width) { 8343 if (auto *CI = dyn_cast<CmpInst>(V)) 8344 V = CI->getOperand(0); 8345 Width = DL->getTypeSizeInBits(V->getType()); 8346 } 8347 8348 for (Instruction *I : Visited) 8349 InstrElementSize[I] = Width; 8350 8351 return Width; 8352 } 8353 8354 // Determine if a value V in a vectorizable expression Expr can be demoted to a 8355 // smaller type with a truncation. We collect the values that will be demoted 8356 // in ToDemote and additional roots that require investigating in Roots. 8357 static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr, 8358 SmallVectorImpl<Value *> &ToDemote, 8359 SmallVectorImpl<Value *> &Roots) { 8360 // We can always demote constants. 8361 if (isa<Constant>(V)) { 8362 ToDemote.push_back(V); 8363 return true; 8364 } 8365 8366 // If the value is not an instruction in the expression with only one use, it 8367 // cannot be demoted. 8368 auto *I = dyn_cast<Instruction>(V); 8369 if (!I || !I->hasOneUse() || !Expr.count(I)) 8370 return false; 8371 8372 switch (I->getOpcode()) { 8373 8374 // We can always demote truncations and extensions. Since truncations can 8375 // seed additional demotion, we save the truncated value. 8376 case Instruction::Trunc: 8377 Roots.push_back(I->getOperand(0)); 8378 break; 8379 case Instruction::ZExt: 8380 case Instruction::SExt: 8381 if (isa<ExtractElementInst>(I->getOperand(0)) || 8382 isa<InsertElementInst>(I->getOperand(0))) 8383 return false; 8384 break; 8385 8386 // We can demote certain binary operations if we can demote both of their 8387 // operands. 8388 case Instruction::Add: 8389 case Instruction::Sub: 8390 case Instruction::Mul: 8391 case Instruction::And: 8392 case Instruction::Or: 8393 case Instruction::Xor: 8394 if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) || 8395 !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots)) 8396 return false; 8397 break; 8398 8399 // We can demote selects if we can demote their true and false values. 8400 case Instruction::Select: { 8401 SelectInst *SI = cast<SelectInst>(I); 8402 if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) || 8403 !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots)) 8404 return false; 8405 break; 8406 } 8407 8408 // We can demote phis if we can demote all their incoming operands. Note that 8409 // we don't need to worry about cycles since we ensure single use above. 8410 case Instruction::PHI: { 8411 PHINode *PN = cast<PHINode>(I); 8412 for (Value *IncValue : PN->incoming_values()) 8413 if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots)) 8414 return false; 8415 break; 8416 } 8417 8418 // Otherwise, conservatively give up. 8419 default: 8420 return false; 8421 } 8422 8423 // Record the value that we can demote. 8424 ToDemote.push_back(V); 8425 return true; 8426 } 8427 8428 void BoUpSLP::computeMinimumValueSizes() { 8429 // If there are no external uses, the expression tree must be rooted by a 8430 // store. We can't demote in-memory values, so there is nothing to do here. 8431 if (ExternalUses.empty()) 8432 return; 8433 8434 // We only attempt to truncate integer expressions. 8435 auto &TreeRoot = VectorizableTree[0]->Scalars; 8436 auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType()); 8437 if (!TreeRootIT) 8438 return; 8439 8440 // If the expression is not rooted by a store, these roots should have 8441 // external uses. 
We will rely on InstCombine to rewrite the expression in 8442 // the narrower type. However, InstCombine only rewrites single-use values. 8443 // This means that if a tree entry other than a root is used externally, it 8444 // must have multiple uses and InstCombine will not rewrite it. The code 8445 // below ensures that only the roots are used externally. 8446 SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end()); 8447 for (auto &EU : ExternalUses) 8448 if (!Expr.erase(EU.Scalar)) 8449 return; 8450 if (!Expr.empty()) 8451 return; 8452 8453 // Collect the scalar values of the vectorizable expression. We will use this 8454 // context to determine which values can be demoted. If we see a truncation, 8455 // we mark it as seeding another demotion. 8456 for (auto &EntryPtr : VectorizableTree) 8457 Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end()); 8458 8459 // Ensure the roots of the vectorizable tree don't form a cycle. They must 8460 // have a single external user that is not in the vectorizable tree. 8461 for (auto *Root : TreeRoot) 8462 if (!Root->hasOneUse() || Expr.count(*Root->user_begin())) 8463 return; 8464 8465 // Conservatively determine if we can actually truncate the roots of the 8466 // expression. Collect the values that can be demoted in ToDemote and 8467 // additional roots that require investigating in Roots. 8468 SmallVector<Value *, 32> ToDemote; 8469 SmallVector<Value *, 4> Roots; 8470 for (auto *Root : TreeRoot) 8471 if (!collectValuesToDemote(Root, Expr, ToDemote, Roots)) 8472 return; 8473 8474 // The maximum bit width required to represent all the values that can be 8475 // demoted without loss of precision. It would be safe to truncate the roots 8476 // of the expression to this width. 8477 auto MaxBitWidth = 8u; 8478 8479 // We first check if all the bits of the roots are demanded. If they're not, 8480 // we can truncate the roots to this narrower type. 8481 for (auto *Root : TreeRoot) { 8482 auto Mask = DB->getDemandedBits(cast<Instruction>(Root)); 8483 MaxBitWidth = std::max<unsigned>( 8484 Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth); 8485 } 8486 8487 // True if the roots can be zero-extended back to their original type, rather 8488 // than sign-extended. We know that if the leading bits are not demanded, we 8489 // can safely zero-extend. So we initialize IsKnownPositive to True. 8490 bool IsKnownPositive = true; 8491 8492 // If all the bits of the roots are demanded, we can try a little harder to 8493 // compute a narrower type. This can happen, for example, if the roots are 8494 // getelementptr indices. InstCombine promotes these indices to the pointer 8495 // width. Thus, all their bits are technically demanded even though the 8496 // address computation might be vectorized in a smaller type. 8497 // 8498 // We start by looking at each entry that can be demoted. We compute the 8499 // maximum bit width required to store the scalar by using ValueTracking to 8500 // compute the number of high-order bits we can truncate. 8501 if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) && 8502 llvm::all_of(TreeRoot, [](Value *R) { 8503 assert(R->hasOneUse() && "Root should have only one use!"); 8504 return isa<GetElementPtrInst>(R->user_back()); 8505 })) { 8506 MaxBitWidth = 8u; 8507 8508 // Determine if the sign bit of all the roots is known to be zero. If not, 8509 // IsKnownPositive is set to False. 
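// For example (illustrative), a root such as 'and i32 %x, 255' has its top 24
// bits known zero, so its sign bit is zero and the narrowed value can later be
// zero-extended; if this cannot be shown for every root, one extra bit is
// reserved below so that sign-extension reproduces the original value.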
8510 IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
8511 KnownBits Known = computeKnownBits(R, *DL);
8512 return Known.isNonNegative();
8513 });
8514 
8515 // Determine the maximum number of bits required to store the scalar
8516 // values.
8517 for (auto *Scalar : ToDemote) {
8518 auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
8519 auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
8520 MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
8521 }
8522 
8523 // If we can't prove that the sign bit is zero, we must add one to the
8524 // maximum bit width to account for the unknown sign bit. This preserves
8525 // the existing sign bit so we can safely sign-extend the root back to the
8526 // original type. Otherwise, if we know the sign bit is zero, we will
8527 // zero-extend the root instead.
8528 //
8529 // FIXME: This is somewhat suboptimal, as there will be cases where adding
8530 // one to the maximum bit width will yield a larger-than-necessary
8531 // type. In general, we need to add an extra bit only if we can't
8532 // prove that the upper bit of the original type is equal to the
8533 // upper bit of the proposed smaller type. If these two bits are the
8534 // same (either zero or one) we know that sign-extending from the
8535 // smaller type will result in the same value. Here, since we can't
8536 // yet prove this, we are just making the proposed smaller type
8537 // larger to ensure correctness.
8538 if (!IsKnownPositive)
8539 ++MaxBitWidth;
8540 }
8541 
8542 // Round MaxBitWidth up to the next power-of-two.
8543 if (!isPowerOf2_64(MaxBitWidth))
8544 MaxBitWidth = NextPowerOf2(MaxBitWidth);
8545 
8546 // If the maximum bit width we compute is less than the width of the roots'
8547 // type, we can proceed with the narrowing. Otherwise, do nothing.
8548 if (MaxBitWidth >= TreeRootIT->getBitWidth())
8549 return;
8550 
8551 // If we can truncate the root, we must collect additional values that might
8552 // be demoted as a result. That is, those seeded by truncations we will
8553 // modify.
8554 while (!Roots.empty())
8555 collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
8556 
8557 // Finally, map the values we can demote to the maximum bit width we computed.
8558 for (auto *Scalar : ToDemote)
8559 MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
8560 }
8561 
8562 namespace {
8563 
8564 /// The SLPVectorizer Pass.
8565 struct SLPVectorizer : public FunctionPass {
8566 SLPVectorizerPass Impl;
8567 
8568 /// Pass identification, replacement for typeid
8569 static char ID;
8570 
8571 explicit SLPVectorizer() : FunctionPass(ID) {
8572 initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
8573 }
8574 
8575 bool doInitialization(Module &M) override { return false; }
8576 
8577 bool runOnFunction(Function &F) override {
8578 if (skipFunction(F))
8579 return false;
8580 
8581 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
8582 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
8583 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
8584 auto *TLI = TLIP ?
&TLIP->getTLI(F) : nullptr; 8585 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 8586 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 8587 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 8588 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 8589 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 8590 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 8591 8592 return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); 8593 } 8594 8595 void getAnalysisUsage(AnalysisUsage &AU) const override { 8596 FunctionPass::getAnalysisUsage(AU); 8597 AU.addRequired<AssumptionCacheTracker>(); 8598 AU.addRequired<ScalarEvolutionWrapperPass>(); 8599 AU.addRequired<AAResultsWrapperPass>(); 8600 AU.addRequired<TargetTransformInfoWrapperPass>(); 8601 AU.addRequired<LoopInfoWrapperPass>(); 8602 AU.addRequired<DominatorTreeWrapperPass>(); 8603 AU.addRequired<DemandedBitsWrapperPass>(); 8604 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 8605 AU.addRequired<InjectTLIMappingsLegacy>(); 8606 AU.addPreserved<LoopInfoWrapperPass>(); 8607 AU.addPreserved<DominatorTreeWrapperPass>(); 8608 AU.addPreserved<AAResultsWrapperPass>(); 8609 AU.addPreserved<GlobalsAAWrapperPass>(); 8610 AU.setPreservesCFG(); 8611 } 8612 }; 8613 8614 } // end anonymous namespace 8615 8616 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { 8617 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F); 8618 auto *TTI = &AM.getResult<TargetIRAnalysis>(F); 8619 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F); 8620 auto *AA = &AM.getResult<AAManager>(F); 8621 auto *LI = &AM.getResult<LoopAnalysis>(F); 8622 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F); 8623 auto *AC = &AM.getResult<AssumptionAnalysis>(F); 8624 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F); 8625 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 8626 8627 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); 8628 if (!Changed) 8629 return PreservedAnalyses::all(); 8630 8631 PreservedAnalyses PA; 8632 PA.preserveSet<CFGAnalyses>(); 8633 return PA; 8634 } 8635 8636 bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, 8637 TargetTransformInfo *TTI_, 8638 TargetLibraryInfo *TLI_, AAResults *AA_, 8639 LoopInfo *LI_, DominatorTree *DT_, 8640 AssumptionCache *AC_, DemandedBits *DB_, 8641 OptimizationRemarkEmitter *ORE_) { 8642 if (!RunSLPVectorization) 8643 return false; 8644 SE = SE_; 8645 TTI = TTI_; 8646 TLI = TLI_; 8647 AA = AA_; 8648 LI = LI_; 8649 DT = DT_; 8650 AC = AC_; 8651 DB = DB_; 8652 DL = &F.getParent()->getDataLayout(); 8653 8654 Stores.clear(); 8655 GEPs.clear(); 8656 bool Changed = false; 8657 8658 // If the target claims to have no vector registers don't attempt 8659 // vectorization. 8660 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) { 8661 LLVM_DEBUG( 8662 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n"); 8663 return false; 8664 } 8665 8666 // Don't vectorize when the attribute NoImplicitFloat is used. 8667 if (F.hasFnAttribute(Attribute::NoImplicitFloat)) 8668 return false; 8669 8670 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); 8671 8672 // Use the bottom up slp vectorizer to construct chains that start with 8673 // store instructions. 
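// For example (illustrative), four adjacent scalar stores such as
//   store i32 %a, i32* %p
//   store i32 %b, i32* %q   ; %q = getelementptr inbounds i32, i32* %p, i64 1
//   ...
// can seed a tree that is ultimately emitted as a single <4 x i32> store, if
// the cost model considers that profitable.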
8674 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_); 8675 8676 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to 8677 // delete instructions. 8678 8679 // Update DFS numbers now so that we can use them for ordering. 8680 DT->updateDFSNumbers(); 8681 8682 // Scan the blocks in the function in post order. 8683 for (auto BB : post_order(&F.getEntryBlock())) { 8684 collectSeedInstructions(BB); 8685 8686 // Vectorize trees that end at stores. 8687 if (!Stores.empty()) { 8688 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size() 8689 << " underlying objects.\n"); 8690 Changed |= vectorizeStoreChains(R); 8691 } 8692 8693 // Vectorize trees that end at reductions. 8694 Changed |= vectorizeChainsInBlock(BB, R); 8695 8696 // Vectorize the index computations of getelementptr instructions. This 8697 // is primarily intended to catch gather-like idioms ending at 8698 // non-consecutive loads. 8699 if (!GEPs.empty()) { 8700 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size() 8701 << " underlying objects.\n"); 8702 Changed |= vectorizeGEPIndices(BB, R); 8703 } 8704 } 8705 8706 if (Changed) { 8707 R.optimizeGatherSequence(); 8708 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); 8709 } 8710 return Changed; 8711 } 8712 8713 bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, 8714 unsigned Idx) { 8715 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() 8716 << "\n"); 8717 const unsigned Sz = R.getVectorElementSize(Chain[0]); 8718 const unsigned MinVF = R.getMinVecRegSize() / Sz; 8719 unsigned VF = Chain.size(); 8720 8721 if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) 8722 return false; 8723 8724 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx 8725 << "\n"); 8726 8727 R.buildTree(Chain); 8728 if (R.isTreeTinyAndNotFullyVectorizable()) 8729 return false; 8730 if (R.isLoadCombineCandidate()) 8731 return false; 8732 R.reorderTopToBottom(); 8733 R.reorderBottomToTop(); 8734 R.buildExternalUses(); 8735 8736 R.computeMinimumValueSizes(); 8737 8738 InstructionCost Cost = R.getTreeCost(); 8739 8740 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF =" << VF << "\n"); 8741 if (Cost < -SLPCostThreshold) { 8742 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n"); 8743 8744 using namespace ore; 8745 8746 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", 8747 cast<StoreInst>(Chain[0])) 8748 << "Stores SLP vectorized with cost " << NV("Cost", Cost) 8749 << " and with tree size " 8750 << NV("TreeSize", R.getTreeSize())); 8751 8752 R.vectorizeTree(); 8753 return true; 8754 } 8755 8756 return false; 8757 } 8758 8759 bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, 8760 BoUpSLP &R) { 8761 // We may run into multiple chains that merge into a single chain. We mark the 8762 // stores that we vectorized so that we don't visit the same store twice. 
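// For example (illustrative), chains discovered from two different seed stores
// may share a common tail; once that tail has been vectorized as part of the
// first chain, the second chain skips those stores instead of building an
// overlapping tree.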
8763 BoUpSLP::ValueSet VectorizedStores; 8764 bool Changed = false; 8765 8766 int E = Stores.size(); 8767 SmallBitVector Tails(E, false); 8768 int MaxIter = MaxStoreLookup.getValue(); 8769 SmallVector<std::pair<int, int>, 16> ConsecutiveChain( 8770 E, std::make_pair(E, INT_MAX)); 8771 SmallVector<SmallBitVector, 4> CheckedPairs(E, SmallBitVector(E, false)); 8772 int IterCnt; 8773 auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter, 8774 &CheckedPairs, 8775 &ConsecutiveChain](int K, int Idx) { 8776 if (IterCnt >= MaxIter) 8777 return true; 8778 if (CheckedPairs[Idx].test(K)) 8779 return ConsecutiveChain[K].second == 1 && 8780 ConsecutiveChain[K].first == Idx; 8781 ++IterCnt; 8782 CheckedPairs[Idx].set(K); 8783 CheckedPairs[K].set(Idx); 8784 Optional<int> Diff = getPointersDiff( 8785 Stores[K]->getValueOperand()->getType(), Stores[K]->getPointerOperand(), 8786 Stores[Idx]->getValueOperand()->getType(), 8787 Stores[Idx]->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true); 8788 if (!Diff || *Diff == 0) 8789 return false; 8790 int Val = *Diff; 8791 if (Val < 0) { 8792 if (ConsecutiveChain[Idx].second > -Val) { 8793 Tails.set(K); 8794 ConsecutiveChain[Idx] = std::make_pair(K, -Val); 8795 } 8796 return false; 8797 } 8798 if (ConsecutiveChain[K].second <= Val) 8799 return false; 8800 8801 Tails.set(Idx); 8802 ConsecutiveChain[K] = std::make_pair(Idx, Val); 8803 return Val == 1; 8804 }; 8805 // Do a quadratic search on all of the given stores in reverse order and find 8806 // all of the pairs of stores that follow each other. 8807 for (int Idx = E - 1; Idx >= 0; --Idx) { 8808 // If a store has multiple consecutive store candidates, search according 8809 // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ... 8810 // This is because usually pairing with immediate succeeding or preceding 8811 // candidate create the best chance to find slp vectorization opportunity. 8812 const int MaxLookDepth = std::max(E - Idx, Idx + 1); 8813 IterCnt = 0; 8814 for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset) 8815 if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) || 8816 (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx))) 8817 break; 8818 } 8819 8820 // Tracks if we tried to vectorize stores starting from the given tail 8821 // already. 8822 SmallBitVector TriedTails(E, false); 8823 // For stores that start but don't end a link in the chain: 8824 for (int Cnt = E; Cnt > 0; --Cnt) { 8825 int I = Cnt - 1; 8826 if (ConsecutiveChain[I].first == E || Tails.test(I)) 8827 continue; 8828 // We found a store instr that starts a chain. Now follow the chain and try 8829 // to vectorize it. 8830 BoUpSLP::ValueList Operands; 8831 // Collect the chain into a list. 8832 while (I != E && !VectorizedStores.count(Stores[I])) { 8833 Operands.push_back(Stores[I]); 8834 Tails.set(I); 8835 if (ConsecutiveChain[I].second != 1) { 8836 // Mark the new end in the chain and go back, if required. It might be 8837 // required if the original stores come in reversed order, for example. 8838 if (ConsecutiveChain[I].first != E && 8839 Tails.test(ConsecutiveChain[I].first) && !TriedTails.test(I) && 8840 !VectorizedStores.count(Stores[ConsecutiveChain[I].first])) { 8841 TriedTails.set(I); 8842 Tails.reset(ConsecutiveChain[I].first); 8843 if (Cnt < ConsecutiveChain[I].first + 2) 8844 Cnt = ConsecutiveChain[I].first + 2; 8845 } 8846 break; 8847 } 8848 // Move to the next value in the chain. 
8849 I = ConsecutiveChain[I].first; 8850 } 8851 assert(!Operands.empty() && "Expected non-empty list of stores."); 8852 8853 unsigned MaxVecRegSize = R.getMaxVecRegSize(); 8854 unsigned EltSize = R.getVectorElementSize(Operands[0]); 8855 unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize); 8856 8857 unsigned MinVF = R.getMinVF(EltSize); 8858 unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store), 8859 MaxElts); 8860 8861 // FIXME: Is division-by-2 the correct step? Should we assert that the 8862 // register size is a power-of-2? 8863 unsigned StartIdx = 0; 8864 for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) { 8865 for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { 8866 ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size); 8867 if (!VectorizedStores.count(Slice.front()) && 8868 !VectorizedStores.count(Slice.back()) && 8869 vectorizeStoreChain(Slice, R, Cnt)) { 8870 // Mark the vectorized stores so that we don't vectorize them again. 8871 VectorizedStores.insert(Slice.begin(), Slice.end()); 8872 Changed = true; 8873 // If we vectorized initial block, no need to try to vectorize it 8874 // again. 8875 if (Cnt == StartIdx) 8876 StartIdx += Size; 8877 Cnt += Size; 8878 continue; 8879 } 8880 ++Cnt; 8881 } 8882 // Check if the whole array was vectorized already - exit. 8883 if (StartIdx >= Operands.size()) 8884 break; 8885 } 8886 } 8887 8888 return Changed; 8889 } 8890 8891 void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) { 8892 // Initialize the collections. We will make a single pass over the block. 8893 Stores.clear(); 8894 GEPs.clear(); 8895 8896 // Visit the store and getelementptr instructions in BB and organize them in 8897 // Stores and GEPs according to the underlying objects of their pointer 8898 // operands. 8899 for (Instruction &I : *BB) { 8900 // Ignore store instructions that are volatile or have a pointer operand 8901 // that doesn't point to a scalar type. 8902 if (auto *SI = dyn_cast<StoreInst>(&I)) { 8903 if (!SI->isSimple()) 8904 continue; 8905 if (!isValidElementType(SI->getValueOperand()->getType())) 8906 continue; 8907 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI); 8908 } 8909 8910 // Ignore getelementptr instructions that have more than one index, a 8911 // constant index, or a pointer operand that doesn't point to a scalar 8912 // type. 8913 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { 8914 auto Idx = GEP->idx_begin()->get(); 8915 if (GEP->getNumIndices() > 1 || isa<Constant>(Idx)) 8916 continue; 8917 if (!isValidElementType(Idx->getType())) 8918 continue; 8919 if (GEP->getType()->isVectorTy()) 8920 continue; 8921 GEPs[GEP->getPointerOperand()].push_back(GEP); 8922 } 8923 } 8924 } 8925 8926 bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { 8927 if (!A || !B) 8928 return false; 8929 if (isa<InsertElementInst>(A) || isa<InsertElementInst>(B)) 8930 return false; 8931 Value *VL[] = {A, B}; 8932 return tryToVectorizeList(VL, R); 8933 } 8934 8935 bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, 8936 bool LimitForRegisterSize) { 8937 if (VL.size() < 2) 8938 return false; 8939 8940 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " 8941 << VL.size() << ".\n"); 8942 8943 // Check that all of the parts are instructions of the same type, 8944 // we permit an alternate opcode via InstructionsState. 
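// For example (illustrative), a list such as {add, sub, add, sub} has no
// single opcode, but getSameOpcode may still accept it as an alternate-opcode
// bundle, which can later be emitted as a vector add, a vector sub and a
// shuffle selecting lanes from each.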
8945 InstructionsState S = getSameOpcode(VL); 8946 if (!S.getOpcode()) 8947 return false; 8948 8949 Instruction *I0 = cast<Instruction>(S.OpValue); 8950 // Make sure invalid types (including vector type) are rejected before 8951 // determining vectorization factor for scalar instructions. 8952 for (Value *V : VL) { 8953 Type *Ty = V->getType(); 8954 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) { 8955 // NOTE: the following will give user internal llvm type name, which may 8956 // not be useful. 8957 R.getORE()->emit([&]() { 8958 std::string type_str; 8959 llvm::raw_string_ostream rso(type_str); 8960 Ty->print(rso); 8961 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0) 8962 << "Cannot SLP vectorize list: type " 8963 << rso.str() + " is unsupported by vectorizer"; 8964 }); 8965 return false; 8966 } 8967 } 8968 8969 unsigned Sz = R.getVectorElementSize(I0); 8970 unsigned MinVF = R.getMinVF(Sz); 8971 unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF); 8972 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF); 8973 if (MaxVF < 2) { 8974 R.getORE()->emit([&]() { 8975 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0) 8976 << "Cannot SLP vectorize list: vectorization factor " 8977 << "less than 2 is not supported"; 8978 }); 8979 return false; 8980 } 8981 8982 bool Changed = false; 8983 bool CandidateFound = false; 8984 InstructionCost MinCost = SLPCostThreshold.getValue(); 8985 Type *ScalarTy = VL[0]->getType(); 8986 if (auto *IE = dyn_cast<InsertElementInst>(VL[0])) 8987 ScalarTy = IE->getOperand(1)->getType(); 8988 8989 unsigned NextInst = 0, MaxInst = VL.size(); 8990 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) { 8991 // No actual vectorization should happen, if number of parts is the same as 8992 // provided vectorization factor (i.e. the scalar type is used for vector 8993 // code during codegen). 8994 auto *VecTy = FixedVectorType::get(ScalarTy, VF); 8995 if (TTI->getNumberOfParts(VecTy) == VF) 8996 continue; 8997 for (unsigned I = NextInst; I < MaxInst; ++I) { 8998 unsigned OpsWidth = 0; 8999 9000 if (I + VF > MaxInst) 9001 OpsWidth = MaxInst - I; 9002 else 9003 OpsWidth = VF; 9004 9005 if (!isPowerOf2_32(OpsWidth)) 9006 continue; 9007 9008 if ((LimitForRegisterSize && OpsWidth < MaxVF) || 9009 (VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2)) 9010 break; 9011 9012 ArrayRef<Value *> Ops = VL.slice(I, OpsWidth); 9013 // Check that a previous iteration of this loop did not delete the Value. 
9014 if (llvm::any_of(Ops, [&R](Value *V) { 9015 auto *I = dyn_cast<Instruction>(V); 9016 return I && R.isDeleted(I); 9017 })) 9018 continue; 9019 9020 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " 9021 << "\n"); 9022 9023 R.buildTree(Ops); 9024 if (R.isTreeTinyAndNotFullyVectorizable()) 9025 continue; 9026 R.reorderTopToBottom(); 9027 R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front())); 9028 R.buildExternalUses(); 9029 9030 R.computeMinimumValueSizes(); 9031 InstructionCost Cost = R.getTreeCost(); 9032 CandidateFound = true; 9033 MinCost = std::min(MinCost, Cost); 9034 9035 if (Cost < -SLPCostThreshold) { 9036 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); 9037 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList", 9038 cast<Instruction>(Ops[0])) 9039 << "SLP vectorized with cost " << ore::NV("Cost", Cost) 9040 << " and with tree size " 9041 << ore::NV("TreeSize", R.getTreeSize())); 9042 9043 R.vectorizeTree(); 9044 // Move to the next bundle. 9045 I += VF - 1; 9046 NextInst = I + 1; 9047 Changed = true; 9048 } 9049 } 9050 } 9051 9052 if (!Changed && CandidateFound) { 9053 R.getORE()->emit([&]() { 9054 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0) 9055 << "List vectorization was possible but not beneficial with cost " 9056 << ore::NV("Cost", MinCost) << " >= " 9057 << ore::NV("Treshold", -SLPCostThreshold); 9058 }); 9059 } else if (!Changed) { 9060 R.getORE()->emit([&]() { 9061 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0) 9062 << "Cannot SLP vectorize list: vectorization was impossible" 9063 << " with available vectorization factors"; 9064 }); 9065 } 9066 return Changed; 9067 } 9068 9069 bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { 9070 if (!I) 9071 return false; 9072 9073 if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I)) 9074 return false; 9075 9076 Value *P = I->getParent(); 9077 9078 // Vectorize in current basic block only. 9079 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0)); 9080 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1)); 9081 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P) 9082 return false; 9083 9084 // Try to vectorize V. 9085 if (tryToVectorizePair(Op0, Op1, R)) 9086 return true; 9087 9088 auto *A = dyn_cast<BinaryOperator>(Op0); 9089 auto *B = dyn_cast<BinaryOperator>(Op1); 9090 // Try to skip B. 9091 if (B && B->hasOneUse()) { 9092 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0)); 9093 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1)); 9094 if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R)) 9095 return true; 9096 if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R)) 9097 return true; 9098 } 9099 9100 // Try to skip A. 9101 if (A && A->hasOneUse()) { 9102 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0)); 9103 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1)); 9104 if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R)) 9105 return true; 9106 if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R)) 9107 return true; 9108 } 9109 return false; 9110 } 9111 9112 namespace { 9113 9114 /// Model horizontal reductions. 9115 /// 9116 /// A horizontal reduction is a tree of reduction instructions that has values 9117 /// that can be put into a vector as its leaves. For example: 9118 /// 9119 /// mul mul mul mul 9120 /// \ / \ / 9121 /// + + 9122 /// \ / 9123 /// + 9124 /// This tree has "mul" as its leaf values and "+" as its reduction 9125 /// instructions. 
A reduction can feed into a store or a binary operation 9126 /// feeding a phi. 9127 /// ... 9128 /// \ / 9129 /// + 9130 /// | 9131 /// phi += 9132 /// 9133 /// Or: 9134 /// ... 9135 /// \ / 9136 /// + 9137 /// | 9138 /// *p = 9139 /// 9140 class HorizontalReduction { 9141 using ReductionOpsType = SmallVector<Value *, 16>; 9142 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>; 9143 ReductionOpsListType ReductionOps; 9144 SmallVector<Value *, 32> ReducedVals; 9145 // Use map vector to make stable output. 9146 MapVector<Instruction *, Value *> ExtraArgs; 9147 WeakTrackingVH ReductionRoot; 9148 /// The type of reduction operation. 9149 RecurKind RdxKind; 9150 9151 const unsigned INVALID_OPERAND_INDEX = std::numeric_limits<unsigned>::max(); 9152 9153 static bool isCmpSelMinMax(Instruction *I) { 9154 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) && 9155 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I)); 9156 } 9157 9158 // And/or are potentially poison-safe logical patterns like: 9159 // select x, y, false 9160 // select x, true, y 9161 static bool isBoolLogicOp(Instruction *I) { 9162 return match(I, m_LogicalAnd(m_Value(), m_Value())) || 9163 match(I, m_LogicalOr(m_Value(), m_Value())); 9164 } 9165 9166 /// Checks if instruction is associative and can be vectorized. 9167 static bool isVectorizable(RecurKind Kind, Instruction *I) { 9168 if (Kind == RecurKind::None) 9169 return false; 9170 9171 // Integer ops that map to select instructions or intrinsics are fine. 9172 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) || 9173 isBoolLogicOp(I)) 9174 return true; 9175 9176 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) { 9177 // FP min/max are associative except for NaN and -0.0. We do not 9178 // have to rule out -0.0 here because the intrinsic semantics do not 9179 // specify a fixed result for it. 9180 return I->getFastMathFlags().noNaNs(); 9181 } 9182 9183 return I->isAssociative(); 9184 } 9185 9186 static Value *getRdxOperand(Instruction *I, unsigned Index) { 9187 // Poison-safe 'or' takes the form: select X, true, Y 9188 // To make that work with the normal operand processing, we skip the 9189 // true value operand. 9190 // TODO: Change the code and data structures to handle this without a hack. 9191 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1) 9192 return I->getOperand(2); 9193 return I->getOperand(Index); 9194 } 9195 9196 /// Checks if the ParentStackElem.first should be marked as a reduction 9197 /// operation with an extra argument or as extra argument itself. 9198 void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem, 9199 Value *ExtraArg) { 9200 if (ExtraArgs.count(ParentStackElem.first)) { 9201 ExtraArgs[ParentStackElem.first] = nullptr; 9202 // We ran into something like: 9203 // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg. 9204 // The whole ParentStackElem.first should be considered as an extra value 9205 // in this case. 9206 // Do not perform analysis of remaining operands of ParentStackElem.first 9207 // instruction, this whole instruction is an extra argument. 9208 ParentStackElem.second = INVALID_OPERAND_INDEX; 9209 } else { 9210 // We ran into something like: 9211 // ParentStackElem.first += ... + ExtraArg + ... 9212 ExtraArgs[ParentStackElem.first] = ExtraArg; 9213 } 9214 } 9215 9216 /// Creates reduction operation with the current opcode. 
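  /// For illustration, with hypothetical value names: for RecurKind::SMax this
  /// emits either
  ///   %c = icmp sgt i32 %lhs, %rhs
  ///   %r = select i1 %c, i32 %lhs, i32 %rhs
  /// when \p UseSelect is set, or a single
  ///   %r = call i32 @llvm.smax.i32(i32 %lhs, i32 %rhs)
  /// otherwise.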
9217 static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS, 9218 Value *RHS, const Twine &Name, bool UseSelect) { 9219 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); 9220 switch (Kind) { 9221 case RecurKind::Or: 9222 if (UseSelect && 9223 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType())) 9224 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name); 9225 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, 9226 Name); 9227 case RecurKind::And: 9228 if (UseSelect && 9229 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType())) 9230 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name); 9231 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, 9232 Name); 9233 case RecurKind::Add: 9234 case RecurKind::Mul: 9235 case RecurKind::Xor: 9236 case RecurKind::FAdd: 9237 case RecurKind::FMul: 9238 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, 9239 Name); 9240 case RecurKind::FMax: 9241 return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS); 9242 case RecurKind::FMin: 9243 return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS); 9244 case RecurKind::SMax: 9245 if (UseSelect) { 9246 Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name); 9247 return Builder.CreateSelect(Cmp, LHS, RHS, Name); 9248 } 9249 return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS); 9250 case RecurKind::SMin: 9251 if (UseSelect) { 9252 Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name); 9253 return Builder.CreateSelect(Cmp, LHS, RHS, Name); 9254 } 9255 return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS); 9256 case RecurKind::UMax: 9257 if (UseSelect) { 9258 Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name); 9259 return Builder.CreateSelect(Cmp, LHS, RHS, Name); 9260 } 9261 return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS); 9262 case RecurKind::UMin: 9263 if (UseSelect) { 9264 Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name); 9265 return Builder.CreateSelect(Cmp, LHS, RHS, Name); 9266 } 9267 return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS); 9268 default: 9269 llvm_unreachable("Unknown reduction operation."); 9270 } 9271 } 9272 9273 /// Creates reduction operation with the current opcode with the IR flags 9274 /// from \p ReductionOps. 9275 static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, 9276 Value *RHS, const Twine &Name, 9277 const ReductionOpsListType &ReductionOps) { 9278 bool UseSelect = ReductionOps.size() == 2 || 9279 // Logical or/and. 9280 (ReductionOps.size() == 1 && 9281 isa<SelectInst>(ReductionOps.front().front())); 9282 assert((!UseSelect || ReductionOps.size() != 2 || 9283 isa<SelectInst>(ReductionOps[1][0])) && 9284 "Expected cmp + select pairs for reduction"); 9285 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect); 9286 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { 9287 if (auto *Sel = dyn_cast<SelectInst>(Op)) { 9288 propagateIRFlags(Sel->getCondition(), ReductionOps[0]); 9289 propagateIRFlags(Op, ReductionOps[1]); 9290 return Op; 9291 } 9292 } 9293 propagateIRFlags(Op, ReductionOps[0]); 9294 return Op; 9295 } 9296 9297 /// Creates reduction operation with the current opcode with the IR flags 9298 /// from \p I. 
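  /// Sketch of the intent: for an integer min/max built from a cmp + select
  /// pair, the flags of \p I's compare are copied onto the newly created
  /// compare and the flags of \p I itself onto the resulting select; for plain
  /// binary operations only \p I's flags are propagated.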
9299 static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, 9300 Value *RHS, const Twine &Name, Instruction *I) { 9301 auto *SelI = dyn_cast<SelectInst>(I); 9302 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, SelI != nullptr); 9303 if (SelI && RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { 9304 if (auto *Sel = dyn_cast<SelectInst>(Op)) 9305 propagateIRFlags(Sel->getCondition(), SelI->getCondition()); 9306 } 9307 propagateIRFlags(Op, I); 9308 return Op; 9309 } 9310 9311 static RecurKind getRdxKind(Instruction *I) { 9312 assert(I && "Expected instruction for reduction matching"); 9313 if (match(I, m_Add(m_Value(), m_Value()))) 9314 return RecurKind::Add; 9315 if (match(I, m_Mul(m_Value(), m_Value()))) 9316 return RecurKind::Mul; 9317 if (match(I, m_And(m_Value(), m_Value())) || 9318 match(I, m_LogicalAnd(m_Value(), m_Value()))) 9319 return RecurKind::And; 9320 if (match(I, m_Or(m_Value(), m_Value())) || 9321 match(I, m_LogicalOr(m_Value(), m_Value()))) 9322 return RecurKind::Or; 9323 if (match(I, m_Xor(m_Value(), m_Value()))) 9324 return RecurKind::Xor; 9325 if (match(I, m_FAdd(m_Value(), m_Value()))) 9326 return RecurKind::FAdd; 9327 if (match(I, m_FMul(m_Value(), m_Value()))) 9328 return RecurKind::FMul; 9329 9330 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value()))) 9331 return RecurKind::FMax; 9332 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value()))) 9333 return RecurKind::FMin; 9334 9335 // This matches either cmp+select or intrinsics. SLP is expected to handle 9336 // either form. 9337 // TODO: If we are canonicalizing to intrinsics, we can remove several 9338 // special-case paths that deal with selects. 9339 if (match(I, m_SMax(m_Value(), m_Value()))) 9340 return RecurKind::SMax; 9341 if (match(I, m_SMin(m_Value(), m_Value()))) 9342 return RecurKind::SMin; 9343 if (match(I, m_UMax(m_Value(), m_Value()))) 9344 return RecurKind::UMax; 9345 if (match(I, m_UMin(m_Value(), m_Value()))) 9346 return RecurKind::UMin; 9347 9348 if (auto *Select = dyn_cast<SelectInst>(I)) { 9349 // Try harder: look for min/max pattern based on instructions producing 9350 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2). 9351 // During the intermediate stages of SLP, it's very common to have 9352 // pattern like this (since optimizeGatherSequence is run only once 9353 // at the end): 9354 // %1 = extractelement <2 x i32> %a, i32 0 9355 // %2 = extractelement <2 x i32> %a, i32 1 9356 // %cond = icmp sgt i32 %1, %2 9357 // %3 = extractelement <2 x i32> %a, i32 0 9358 // %4 = extractelement <2 x i32> %a, i32 1 9359 // %select = select i1 %cond, i32 %3, i32 %4 9360 CmpInst::Predicate Pred; 9361 Instruction *L1; 9362 Instruction *L2; 9363 9364 Value *LHS = Select->getTrueValue(); 9365 Value *RHS = Select->getFalseValue(); 9366 Value *Cond = Select->getCondition(); 9367 9368 // TODO: Support inverse predicates. 
9369 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) { 9370 if (!isa<ExtractElementInst>(RHS) || 9371 !L2->isIdenticalTo(cast<Instruction>(RHS))) 9372 return RecurKind::None; 9373 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) { 9374 if (!isa<ExtractElementInst>(LHS) || 9375 !L1->isIdenticalTo(cast<Instruction>(LHS))) 9376 return RecurKind::None; 9377 } else { 9378 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS)) 9379 return RecurKind::None; 9380 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) || 9381 !L1->isIdenticalTo(cast<Instruction>(LHS)) || 9382 !L2->isIdenticalTo(cast<Instruction>(RHS))) 9383 return RecurKind::None; 9384 } 9385 9386 switch (Pred) { 9387 default: 9388 return RecurKind::None; 9389 case CmpInst::ICMP_SGT: 9390 case CmpInst::ICMP_SGE: 9391 return RecurKind::SMax; 9392 case CmpInst::ICMP_SLT: 9393 case CmpInst::ICMP_SLE: 9394 return RecurKind::SMin; 9395 case CmpInst::ICMP_UGT: 9396 case CmpInst::ICMP_UGE: 9397 return RecurKind::UMax; 9398 case CmpInst::ICMP_ULT: 9399 case CmpInst::ICMP_ULE: 9400 return RecurKind::UMin; 9401 } 9402 } 9403 return RecurKind::None; 9404 } 9405 9406 /// Get the index of the first operand. 9407 static unsigned getFirstOperandIndex(Instruction *I) { 9408 return isCmpSelMinMax(I) ? 1 : 0; 9409 } 9410 9411 /// Total number of operands in the reduction operation. 9412 static unsigned getNumberOfOperands(Instruction *I) { 9413 return isCmpSelMinMax(I) ? 3 : 2; 9414 } 9415 9416 /// Checks if the instruction is in basic block \p BB. 9417 /// For a cmp+sel min/max reduction check that both ops are in \p BB. 9418 static bool hasSameParent(Instruction *I, BasicBlock *BB) { 9419 if (isCmpSelMinMax(I) || (isBoolLogicOp(I) && isa<SelectInst>(I))) { 9420 auto *Sel = cast<SelectInst>(I); 9421 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition()); 9422 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB; 9423 } 9424 return I->getParent() == BB; 9425 } 9426 9427 /// Expected number of uses for reduction operations/reduced values. 9428 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) { 9429 if (IsCmpSelMinMax) { 9430 // SelectInst must be used twice while the condition op must have single 9431 // use only. 9432 if (auto *Sel = dyn_cast<SelectInst>(I)) 9433 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse(); 9434 return I->hasNUses(2); 9435 } 9436 9437 // Arithmetic reduction operation must be used once only. 9438 return I->hasOneUse(); 9439 } 9440 9441 /// Initializes the list of reduction operations. 9442 void initReductionOps(Instruction *I) { 9443 if (isCmpSelMinMax(I)) 9444 ReductionOps.assign(2, ReductionOpsType()); 9445 else 9446 ReductionOps.assign(1, ReductionOpsType()); 9447 } 9448 9449 /// Add all reduction operations for the reduction instruction \p I. 
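  /// For a cmp + select min/max reduction the condition is collected into
  /// ReductionOps[0] and the select into ReductionOps[1], e.g. (sketch):
  ///   %c = icmp slt i32 %a, %b          ; -> ReductionOps[0]
  ///   %m = select i1 %c, i32 %a, i32 %b ; -> ReductionOps[1]
  /// Plain arithmetic reductions only populate ReductionOps[0].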
9450 void addReductionOps(Instruction *I) { 9451 if (isCmpSelMinMax(I)) { 9452 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition()); 9453 ReductionOps[1].emplace_back(I); 9454 } else { 9455 ReductionOps[0].emplace_back(I); 9456 } 9457 } 9458 9459 static Value *getLHS(RecurKind Kind, Instruction *I) { 9460 if (Kind == RecurKind::None) 9461 return nullptr; 9462 return I->getOperand(getFirstOperandIndex(I)); 9463 } 9464 static Value *getRHS(RecurKind Kind, Instruction *I) { 9465 if (Kind == RecurKind::None) 9466 return nullptr; 9467 return I->getOperand(getFirstOperandIndex(I) + 1); 9468 } 9469 9470 public: 9471 HorizontalReduction() = default; 9472 9473 /// Try to find a reduction tree. 9474 bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst) { 9475 assert((!Phi || is_contained(Phi->operands(), Inst)) && 9476 "Phi needs to use the binary operator"); 9477 assert((isa<BinaryOperator>(Inst) || isa<SelectInst>(Inst) || 9478 isa<IntrinsicInst>(Inst)) && 9479 "Expected binop, select, or intrinsic for reduction matching"); 9480 RdxKind = getRdxKind(Inst); 9481 9482 // We could have a initial reductions that is not an add. 9483 // r *= v1 + v2 + v3 + v4 9484 // In such a case start looking for a tree rooted in the first '+'. 9485 if (Phi) { 9486 if (getLHS(RdxKind, Inst) == Phi) { 9487 Phi = nullptr; 9488 Inst = dyn_cast<Instruction>(getRHS(RdxKind, Inst)); 9489 if (!Inst) 9490 return false; 9491 RdxKind = getRdxKind(Inst); 9492 } else if (getRHS(RdxKind, Inst) == Phi) { 9493 Phi = nullptr; 9494 Inst = dyn_cast<Instruction>(getLHS(RdxKind, Inst)); 9495 if (!Inst) 9496 return false; 9497 RdxKind = getRdxKind(Inst); 9498 } 9499 } 9500 9501 if (!isVectorizable(RdxKind, Inst)) 9502 return false; 9503 9504 // Analyze "regular" integer/FP types for reductions - no target-specific 9505 // types or pointers. 9506 Type *Ty = Inst->getType(); 9507 if (!isValidElementType(Ty) || Ty->isPointerTy()) 9508 return false; 9509 9510 // Though the ultimate reduction may have multiple uses, its condition must 9511 // have only single use. 9512 if (auto *Sel = dyn_cast<SelectInst>(Inst)) 9513 if (!Sel->getCondition()->hasOneUse()) 9514 return false; 9515 9516 ReductionRoot = Inst; 9517 9518 // The opcode for leaf values that we perform a reduction on. 9519 // For example: load(x) + load(y) + load(z) + fptoui(w) 9520 // The leaf opcode for 'w' does not match, so we don't include it as a 9521 // potential candidate for the reduction. 9522 unsigned LeafOpcode = 0; 9523 9524 // Post-order traverse the reduction tree starting at Inst. We only handle 9525 // true trees containing binary operators or selects. 9526 SmallVector<std::pair<Instruction *, unsigned>, 32> Stack; 9527 Stack.push_back(std::make_pair(Inst, getFirstOperandIndex(Inst))); 9528 initReductionOps(Inst); 9529 while (!Stack.empty()) { 9530 Instruction *TreeN = Stack.back().first; 9531 unsigned EdgeToVisit = Stack.back().second++; 9532 const RecurKind TreeRdxKind = getRdxKind(TreeN); 9533 bool IsReducedValue = TreeRdxKind != RdxKind; 9534 9535 // Postorder visit. 9536 if (IsReducedValue || EdgeToVisit >= getNumberOfOperands(TreeN)) { 9537 if (IsReducedValue) 9538 ReducedVals.push_back(TreeN); 9539 else { 9540 auto ExtraArgsIter = ExtraArgs.find(TreeN); 9541 if (ExtraArgsIter != ExtraArgs.end() && !ExtraArgsIter->second) { 9542 // Check if TreeN is an extra argument of its parent operation. 9543 if (Stack.size() <= 1) { 9544 // TreeN can't be an extra argument as it is a root reduction 9545 // operation. 
9546 return false; 9547 } 9548 // Yes, TreeN is an extra argument, do not add it to a list of 9549 // reduction operations. 9550 // Stack[Stack.size() - 2] always points to the parent operation. 9551 markExtraArg(Stack[Stack.size() - 2], TreeN); 9552 ExtraArgs.erase(TreeN); 9553 } else 9554 addReductionOps(TreeN); 9555 } 9556 // Retract. 9557 Stack.pop_back(); 9558 continue; 9559 } 9560 9561 // Visit operands. 9562 Value *EdgeVal = getRdxOperand(TreeN, EdgeToVisit); 9563 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal); 9564 if (!EdgeInst) { 9565 // Edge value is not a reduction instruction or a leaf instruction. 9566 // (It may be a constant, function argument, or something else.) 9567 markExtraArg(Stack.back(), EdgeVal); 9568 continue; 9569 } 9570 RecurKind EdgeRdxKind = getRdxKind(EdgeInst); 9571 // Continue analysis if the next operand is a reduction operation or 9572 // (possibly) a leaf value. If the leaf value opcode is not set, 9573 // the first met operation != reduction operation is considered as the 9574 // leaf opcode. 9575 // Only handle trees in the current basic block. 9576 // Each tree node needs to have minimal number of users except for the 9577 // ultimate reduction. 9578 const bool IsRdxInst = EdgeRdxKind == RdxKind; 9579 if (EdgeInst != Phi && EdgeInst != Inst && 9580 hasSameParent(EdgeInst, Inst->getParent()) && 9581 hasRequiredNumberOfUses(isCmpSelMinMax(Inst), EdgeInst) && 9582 (!LeafOpcode || LeafOpcode == EdgeInst->getOpcode() || IsRdxInst)) { 9583 if (IsRdxInst) { 9584 // We need to be able to reassociate the reduction operations. 9585 if (!isVectorizable(EdgeRdxKind, EdgeInst)) { 9586 // I is an extra argument for TreeN (its parent operation). 9587 markExtraArg(Stack.back(), EdgeInst); 9588 continue; 9589 } 9590 } else if (!LeafOpcode) { 9591 LeafOpcode = EdgeInst->getOpcode(); 9592 } 9593 Stack.push_back( 9594 std::make_pair(EdgeInst, getFirstOperandIndex(EdgeInst))); 9595 continue; 9596 } 9597 // I is an extra argument for TreeN (its parent operation). 9598 markExtraArg(Stack.back(), EdgeInst); 9599 } 9600 return true; 9601 } 9602 9603 /// Attempt to vectorize the tree found by matchAssociativeReduction. 9604 Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { 9605 // If there are a sufficient number of reduction values, reduce 9606 // to a nearby power-of-2. We can safely generate oversized 9607 // vectors and rely on the backend to split them to legal sizes. 9608 unsigned NumReducedVals = ReducedVals.size(); 9609 if (NumReducedVals < 4) 9610 return nullptr; 9611 9612 // Intersect the fast-math-flags from all reduction operations. 9613 FastMathFlags RdxFMF; 9614 RdxFMF.set(); 9615 for (ReductionOpsType &RdxOp : ReductionOps) { 9616 for (Value *RdxVal : RdxOp) { 9617 if (auto *FPMO = dyn_cast<FPMathOperator>(RdxVal)) 9618 RdxFMF &= FPMO->getFastMathFlags(); 9619 } 9620 } 9621 9622 IRBuilder<> Builder(cast<Instruction>(ReductionRoot)); 9623 Builder.setFastMathFlags(RdxFMF); 9624 9625 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; 9626 // The same extra argument may be used several times, so log each attempt 9627 // to use it. 9628 for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) { 9629 assert(Pair.first && "DebugLoc must be set."); 9630 ExternallyUsedValues[Pair.second].push_back(Pair.first); 9631 } 9632 9633 // The compare instruction of a min/max is the insertion point for new 9634 // instructions and may be replaced with a new compare instruction. 
9635 auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) { 9636 assert(isa<SelectInst>(RdxRootInst) && 9637 "Expected min/max reduction to have select root instruction"); 9638 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition(); 9639 assert(isa<Instruction>(ScalarCond) && 9640 "Expected min/max reduction to have compare condition"); 9641 return cast<Instruction>(ScalarCond); 9642 }; 9643 9644 // The reduction root is used as the insertion point for new instructions, 9645 // so set it as externally used to prevent it from being deleted. 9646 ExternallyUsedValues[ReductionRoot]; 9647 SmallVector<Value *, 16> IgnoreList; 9648 for (ReductionOpsType &RdxOp : ReductionOps) 9649 IgnoreList.append(RdxOp.begin(), RdxOp.end()); 9650 9651 unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); 9652 if (NumReducedVals > ReduxWidth) { 9653 // In the loop below, we are building a tree based on a window of 9654 // 'ReduxWidth' values. 9655 // If the operands of those values have common traits (compare predicate, 9656 // constant operand, etc), then we want to group those together to 9657 // minimize the cost of the reduction. 9658 9659 // TODO: This should be extended to count common operands for 9660 // compares and binops. 9661 9662 // Step 1: Count the number of times each compare predicate occurs. 9663 SmallDenseMap<unsigned, unsigned> PredCountMap; 9664 for (Value *RdxVal : ReducedVals) { 9665 CmpInst::Predicate Pred; 9666 if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value()))) 9667 ++PredCountMap[Pred]; 9668 } 9669 // Step 2: Sort the values so the most common predicates come first. 9670 stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) { 9671 CmpInst::Predicate PredA, PredB; 9672 if (match(A, m_Cmp(PredA, m_Value(), m_Value())) && 9673 match(B, m_Cmp(PredB, m_Value(), m_Value()))) { 9674 return PredCountMap[PredA] > PredCountMap[PredB]; 9675 } 9676 return false; 9677 }); 9678 } 9679 9680 Value *VectorizedTree = nullptr; 9681 unsigned i = 0; 9682 while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { 9683 ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth); 9684 V.buildTree(VL, IgnoreList); 9685 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) 9686 break; 9687 if (V.isLoadCombineReductionCandidate(RdxKind)) 9688 break; 9689 V.reorderTopToBottom(); 9690 V.reorderBottomToTop(/*IgnoreReorder=*/true); 9691 V.buildExternalUses(ExternallyUsedValues); 9692 9693 // For a poison-safe boolean logic reduction, do not replace select 9694 // instructions with logic ops. All reduced values will be frozen (see 9695 // below) to prevent leaking poison. 9696 if (isa<SelectInst>(ReductionRoot) && 9697 isBoolLogicOp(cast<Instruction>(ReductionRoot)) && 9698 NumReducedVals != ReduxWidth) 9699 break; 9700 9701 V.computeMinimumValueSizes(); 9702 9703 // Estimate cost. 
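      // The candidate is vectorized only when TreeCost + ReductionCost is
      // below -SLPCostThreshold; for example, with the default threshold of 0,
      // a TreeCost of -4 and a ReductionCost of 2 give a total cost of -2 and
      // the reduction is emitted, while a total of 1 would be skipped.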
      InstructionCost TreeCost =
          V.getTreeCost(makeArrayRef(&ReducedVals[i], ReduxWidth));
      InstructionCost ReductionCost =
          getReductionCost(TTI, ReducedVals[i], ReduxWidth, RdxFMF);
      InstructionCost Cost = TreeCost + ReductionCost;
      if (!Cost.isValid()) {
        LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
        return nullptr;
      }
      if (Cost >= -SLPCostThreshold) {
        V.getORE()->emit([&]() {
          return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
                                          cast<Instruction>(VL[0]))
                 << "Vectorizing horizontal reduction is possible "
                 << "but not beneficial with cost " << ore::NV("Cost", Cost)
                 << " and threshold "
                 << ore::NV("Threshold", -SLPCostThreshold);
        });
        break;
      }

      LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
                        << Cost << ". (HorRdx)\n");
      V.getORE()->emit([&]() {
        return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
                                  cast<Instruction>(VL[0]))
               << "Vectorized horizontal reduction with cost "
               << ore::NV("Cost", Cost) << " and with tree size "
               << ore::NV("TreeSize", V.getTreeSize());
      });

      // Vectorize a tree.
      DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
      Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);

      // Emit a reduction. If the root is a select (min/max idiom), the insert
      // point is the compare condition of that select.
      Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
      if (isCmpSelMinMax(RdxRootInst))
        Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst));
      else
        Builder.SetInsertPoint(RdxRootInst);

      // To prevent poison from leaking across what used to be sequential, safe,
      // scalar boolean logic operations, the reduction operand must be frozen.
      if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst))
        VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);

      Value *ReducedSubTree =
          emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);

      if (!VectorizedTree) {
        // Initialize the final value in the reduction.
        VectorizedTree = ReducedSubTree;
      } else {
        // Update the final value in the reduction.
        Builder.SetCurrentDebugLocation(Loc);
        VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
                                  ReducedSubTree, "op.rdx", ReductionOps);
      }
      i += ReduxWidth;
      ReduxWidth = PowerOf2Floor(NumReducedVals - i);
    }

    if (VectorizedTree) {
      // Finish the reduction.
      for (; i < NumReducedVals; ++i) {
        auto *I = cast<Instruction>(ReducedVals[i]);
        Builder.SetCurrentDebugLocation(I->getDebugLoc());
        VectorizedTree =
            createOp(Builder, RdxKind, VectorizedTree, I, "", ReductionOps);
      }
      for (auto &Pair : ExternallyUsedValues) {
        // Add each externally used value to the final reduction.
        for (auto *I : Pair.second) {
          Builder.SetCurrentDebugLocation(I->getDebugLoc());
          VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
                                    Pair.first, "op.extra", I);
        }
      }

      ReductionRoot->replaceAllUsesWith(VectorizedTree);

      // Mark all scalar reduction ops for deletion; they are replaced by the
      // vector reductions.
      V.eraseInstructions(IgnoreList);
    }
    return VectorizedTree;
  }

  unsigned numReductionValues() const { return ReducedVals.size(); }

private:
  /// Calculate the cost of a reduction.
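  /// The result is VectorCost - ScalarCost, where the scalar cost models the
  /// ReduxWidth - 1 scalar operations being replaced; for example, an integer
  /// add reduction of width 8 weighs one vector reduction of <8 x i32> against
  /// 7 scalar adds.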
9798 InstructionCost getReductionCost(TargetTransformInfo *TTI, 9799 Value *FirstReducedVal, unsigned ReduxWidth, 9800 FastMathFlags FMF) { 9801 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 9802 Type *ScalarTy = FirstReducedVal->getType(); 9803 FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth); 9804 InstructionCost VectorCost, ScalarCost; 9805 switch (RdxKind) { 9806 case RecurKind::Add: 9807 case RecurKind::Mul: 9808 case RecurKind::Or: 9809 case RecurKind::And: 9810 case RecurKind::Xor: 9811 case RecurKind::FAdd: 9812 case RecurKind::FMul: { 9813 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind); 9814 VectorCost = 9815 TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind); 9816 ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind); 9817 break; 9818 } 9819 case RecurKind::FMax: 9820 case RecurKind::FMin: { 9821 auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy); 9822 auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); 9823 VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, 9824 /*IsUnsigned=*/false, CostKind); 9825 CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind); 9826 ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy, 9827 SclCondTy, RdxPred, CostKind) + 9828 TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, 9829 SclCondTy, RdxPred, CostKind); 9830 break; 9831 } 9832 case RecurKind::SMax: 9833 case RecurKind::SMin: 9834 case RecurKind::UMax: 9835 case RecurKind::UMin: { 9836 auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy); 9837 auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); 9838 bool IsUnsigned = 9839 RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin; 9840 VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned, 9841 CostKind); 9842 CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind); 9843 ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy, 9844 SclCondTy, RdxPred, CostKind) + 9845 TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, 9846 SclCondTy, RdxPred, CostKind); 9847 break; 9848 } 9849 default: 9850 llvm_unreachable("Expected arithmetic or min/max reduction operation"); 9851 } 9852 9853 // Scalar cost is repeated for N-1 elements. 9854 ScalarCost *= (ReduxWidth - 1); 9855 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost 9856 << " for reduction that starts with " << *FirstReducedVal 9857 << " (It is a splitting reduction)\n"); 9858 return VectorCost - ScalarCost; 9859 } 9860 9861 /// Emit a horizontal reduction of the vectorized value. 
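  /// For instance, an 8-wide integer add reduction is typically lowered via
  /// createSimpleTargetReduction to something like
  ///   %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v)
  /// (a sketch; the exact form depends on the target and on RdxKind).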
9862 Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder, 9863 unsigned ReduxWidth, const TargetTransformInfo *TTI) { 9864 assert(VectorizedValue && "Need to have a vectorized tree node"); 9865 assert(isPowerOf2_32(ReduxWidth) && 9866 "We only handle power-of-two reductions for now"); 9867 assert(RdxKind != RecurKind::FMulAdd && 9868 "A call to the llvm.fmuladd intrinsic is not handled yet"); 9869 9870 ++NumVectorInstructions; 9871 return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind); 9872 } 9873 }; 9874 9875 } // end anonymous namespace 9876 9877 static Optional<unsigned> getAggregateSize(Instruction *InsertInst) { 9878 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) 9879 return cast<FixedVectorType>(IE->getType())->getNumElements(); 9880 9881 unsigned AggregateSize = 1; 9882 auto *IV = cast<InsertValueInst>(InsertInst); 9883 Type *CurrentType = IV->getType(); 9884 do { 9885 if (auto *ST = dyn_cast<StructType>(CurrentType)) { 9886 for (auto *Elt : ST->elements()) 9887 if (Elt != ST->getElementType(0)) // check homogeneity 9888 return None; 9889 AggregateSize *= ST->getNumElements(); 9890 CurrentType = ST->getElementType(0); 9891 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) { 9892 AggregateSize *= AT->getNumElements(); 9893 CurrentType = AT->getElementType(); 9894 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) { 9895 AggregateSize *= VT->getNumElements(); 9896 return AggregateSize; 9897 } else if (CurrentType->isSingleValueType()) { 9898 return AggregateSize; 9899 } else { 9900 return None; 9901 } 9902 } while (true); 9903 } 9904 9905 static void findBuildAggregate_rec(Instruction *LastInsertInst, 9906 TargetTransformInfo *TTI, 9907 SmallVectorImpl<Value *> &BuildVectorOpds, 9908 SmallVectorImpl<Value *> &InsertElts, 9909 unsigned OperandOffset) { 9910 do { 9911 Value *InsertedOperand = LastInsertInst->getOperand(1); 9912 Optional<unsigned> OperandIndex = 9913 getInsertIndex(LastInsertInst, OperandOffset); 9914 if (!OperandIndex) 9915 return; 9916 if (isa<InsertElementInst>(InsertedOperand) || 9917 isa<InsertValueInst>(InsertedOperand)) { 9918 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI, 9919 BuildVectorOpds, InsertElts, *OperandIndex); 9920 9921 } else { 9922 BuildVectorOpds[*OperandIndex] = InsertedOperand; 9923 InsertElts[*OperandIndex] = LastInsertInst; 9924 } 9925 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0)); 9926 } while (LastInsertInst != nullptr && 9927 (isa<InsertValueInst>(LastInsertInst) || 9928 isa<InsertElementInst>(LastInsertInst)) && 9929 LastInsertInst->hasOneUse()); 9930 } 9931 9932 /// Recognize construction of vectors like 9933 /// %ra = insertelement <4 x float> poison, float %s0, i32 0 9934 /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 9935 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 9936 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 9937 /// starting from the last insertelement or insertvalue instruction. 9938 /// 9939 /// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>}, 9940 /// {{float, float}, {float, float}}, [2 x {float, float}] and so on. 9941 /// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples. 9942 /// 9943 /// Assume LastInsertInst is of InsertElementInst or InsertValueInst type. 9944 /// 9945 /// \return true if it matches. 
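/// For the insertelement chain shown above, BuildVectorOpds is filled with the
/// scalars %s0..%s3 and InsertElts with the corresponding insertelement
/// instructions (a sketch of the expected outputs); positions that were never
/// written are erased before returning.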
9946 static bool findBuildAggregate(Instruction *LastInsertInst, 9947 TargetTransformInfo *TTI, 9948 SmallVectorImpl<Value *> &BuildVectorOpds, 9949 SmallVectorImpl<Value *> &InsertElts) { 9950 9951 assert((isa<InsertElementInst>(LastInsertInst) || 9952 isa<InsertValueInst>(LastInsertInst)) && 9953 "Expected insertelement or insertvalue instruction!"); 9954 9955 assert((BuildVectorOpds.empty() && InsertElts.empty()) && 9956 "Expected empty result vectors!"); 9957 9958 Optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst); 9959 if (!AggregateSize) 9960 return false; 9961 BuildVectorOpds.resize(*AggregateSize); 9962 InsertElts.resize(*AggregateSize); 9963 9964 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0); 9965 llvm::erase_value(BuildVectorOpds, nullptr); 9966 llvm::erase_value(InsertElts, nullptr); 9967 if (BuildVectorOpds.size() >= 2) 9968 return true; 9969 9970 return false; 9971 } 9972 9973 /// Try and get a reduction value from a phi node. 9974 /// 9975 /// Given a phi node \p P in a block \p ParentBB, consider possible reductions 9976 /// if they come from either \p ParentBB or a containing loop latch. 9977 /// 9978 /// \returns A candidate reduction value if possible, or \code nullptr \endcode 9979 /// if not possible. 9980 static Value *getReductionValue(const DominatorTree *DT, PHINode *P, 9981 BasicBlock *ParentBB, LoopInfo *LI) { 9982 // There are situations where the reduction value is not dominated by the 9983 // reduction phi. Vectorizing such cases has been reported to cause 9984 // miscompiles. See PR25787. 9985 auto DominatedReduxValue = [&](Value *R) { 9986 return isa<Instruction>(R) && 9987 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent()); 9988 }; 9989 9990 Value *Rdx = nullptr; 9991 9992 // Return the incoming value if it comes from the same BB as the phi node. 9993 if (P->getIncomingBlock(0) == ParentBB) { 9994 Rdx = P->getIncomingValue(0); 9995 } else if (P->getIncomingBlock(1) == ParentBB) { 9996 Rdx = P->getIncomingValue(1); 9997 } 9998 9999 if (Rdx && DominatedReduxValue(Rdx)) 10000 return Rdx; 10001 10002 // Otherwise, check whether we have a loop latch to look at. 10003 Loop *BBL = LI->getLoopFor(ParentBB); 10004 if (!BBL) 10005 return nullptr; 10006 BasicBlock *BBLatch = BBL->getLoopLatch(); 10007 if (!BBLatch) 10008 return nullptr; 10009 10010 // There is a loop latch, return the incoming value if it comes from 10011 // that. This reduction pattern occasionally turns up. 
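  // A typical shape, sketched with hypothetical names:
  //   loop:
  //     %sum = phi i32 [ 0, %entry ], [ %sum.next, %latch ]
  //     ...
  //   latch:
  //     %sum.next = add i32 %sum, %x
  // Here %sum.next is the candidate reduction value coming from the latch.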
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = P->getIncomingValue(0);
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = P->getIncomingValue(1);
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}

static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
  if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
    return true;
  return false;
}

/// Attempt to reduce a horizontal reduction.
/// If it is legal to match a horizontal reduction feeding the phi node \a P
/// with reduction operators \a Root (or one of its operands) in a basic block
/// \a BB, then check if it can be done. If a horizontal reduction is not found
/// and the root instruction is a binary operation, vectorization of its
/// operands is attempted.
/// \returns true if a horizontal reduction was matched and reduced or the
/// operands of one of the binary instructions were vectorized.
/// \returns false if a horizontal reduction was not matched (or not possible)
/// or no vectorization of any binary operation feeding \a Root instruction was
/// performed.
static bool tryToVectorizeHorReductionOrInstOperands(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    TargetTransformInfo *TTI,
    const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
  if (!ShouldVectorizeHor)
    return false;

  if (!Root)
    return false;

  if (Root->getParent() != BB || isa<PHINode>(Root))
    return false;
  // Start the analysis from the Root instruction. If a horizontal reduction is
  // found, try to vectorize it. If it is not a horizontal reduction or
  // vectorization is not possible or not effective, and the currently analyzed
  // instruction is a binary operation, try to vectorize the operands, using
  // pre-order DFS traversal order. If the operands were not vectorized, repeat
  // the same procedure considering each operand as a possible root of the
  // horizontal reduction.
  // Interrupt the process if the Root instruction itself was vectorized or all
  // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
  // Skip the analysis of CmpInsts; the compiler implements a post-analysis of
  // the CmpInsts so we can skip extra attempts in
  // tryToVectorizeHorReductionOrInstOperands and save compile time.
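  // For example (sketch), given a root such as %r = mul i32 %t, 2 where %t is
  // a chain of adds over %a, %b, %c and %d, the mul is not reduced itself, so
  // its operands are queued and the add chain is recognized as a horizontal
  // add reduction one level further down.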
10077 std::queue<std::pair<Instruction *, unsigned>> Stack; 10078 Stack.emplace(Root, 0); 10079 SmallPtrSet<Value *, 8> VisitedInstrs; 10080 SmallVector<WeakTrackingVH> PostponedInsts; 10081 bool Res = false; 10082 auto &&TryToReduce = [TTI, &P, &R](Instruction *Inst, Value *&B0, 10083 Value *&B1) -> Value * { 10084 bool IsBinop = matchRdxBop(Inst, B0, B1); 10085 bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value())); 10086 if (IsBinop || IsSelect) { 10087 HorizontalReduction HorRdx; 10088 if (HorRdx.matchAssociativeReduction(P, Inst)) 10089 return HorRdx.tryToReduce(R, TTI); 10090 } 10091 return nullptr; 10092 }; 10093 while (!Stack.empty()) { 10094 Instruction *Inst; 10095 unsigned Level; 10096 std::tie(Inst, Level) = Stack.front(); 10097 Stack.pop(); 10098 // Do not try to analyze instruction that has already been vectorized. 10099 // This may happen when we vectorize instruction operands on a previous 10100 // iteration while stack was populated before that happened. 10101 if (R.isDeleted(Inst)) 10102 continue; 10103 Value *B0 = nullptr, *B1 = nullptr; 10104 if (Value *V = TryToReduce(Inst, B0, B1)) { 10105 Res = true; 10106 // Set P to nullptr to avoid re-analysis of phi node in 10107 // matchAssociativeReduction function unless this is the root node. 10108 P = nullptr; 10109 if (auto *I = dyn_cast<Instruction>(V)) { 10110 // Try to find another reduction. 10111 Stack.emplace(I, Level); 10112 continue; 10113 } 10114 } else { 10115 bool IsBinop = B0 && B1; 10116 if (P && IsBinop) { 10117 Inst = dyn_cast<Instruction>(B0); 10118 if (Inst == P) 10119 Inst = dyn_cast<Instruction>(B1); 10120 if (!Inst) { 10121 // Set P to nullptr to avoid re-analysis of phi node in 10122 // matchAssociativeReduction function unless this is the root node. 10123 P = nullptr; 10124 continue; 10125 } 10126 } 10127 // Set P to nullptr to avoid re-analysis of phi node in 10128 // matchAssociativeReduction function unless this is the root node. 10129 P = nullptr; 10130 // Do not try to vectorize CmpInst operands, this is done separately. 10131 // Final attempt for binop args vectorization should happen after the loop 10132 // to try to find reductions. 10133 if (!isa<CmpInst>(Inst)) 10134 PostponedInsts.push_back(Inst); 10135 } 10136 10137 // Try to vectorize operands. 10138 // Continue analysis for the instruction from the same basic block only to 10139 // save compile time. 10140 if (++Level < RecursionMaxDepth) 10141 for (auto *Op : Inst->operand_values()) 10142 if (VisitedInstrs.insert(Op).second) 10143 if (auto *I = dyn_cast<Instruction>(Op)) 10144 // Do not try to vectorize CmpInst operands, this is done 10145 // separately. 10146 if (!isa<PHINode>(I) && !isa<CmpInst>(I) && !R.isDeleted(I) && 10147 I->getParent() == BB) 10148 Stack.emplace(I, Level); 10149 } 10150 // Try to vectorized binops where reductions were not found. 10151 for (Value *V : PostponedInsts) 10152 if (auto *Inst = dyn_cast<Instruction>(V)) 10153 if (!R.isDeleted(Inst)) 10154 Res |= Vectorize(Inst, R); 10155 return Res; 10156 } 10157 10158 bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, 10159 BasicBlock *BB, BoUpSLP &R, 10160 TargetTransformInfo *TTI) { 10161 auto *I = dyn_cast_or_null<Instruction>(V); 10162 if (!I) 10163 return false; 10164 10165 if (!isa<BinaryOperator>(I)) 10166 P = nullptr; 10167 // Try to match and vectorize a horizontal reduction. 
10168 auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool { 10169 return tryToVectorize(I, R); 10170 }; 10171 return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, 10172 ExtraVectorization); 10173 } 10174 10175 bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, 10176 BasicBlock *BB, BoUpSLP &R) { 10177 const DataLayout &DL = BB->getModule()->getDataLayout(); 10178 if (!R.canMapToVector(IVI->getType(), DL)) 10179 return false; 10180 10181 SmallVector<Value *, 16> BuildVectorOpds; 10182 SmallVector<Value *, 16> BuildVectorInsts; 10183 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts)) 10184 return false; 10185 10186 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); 10187 // Aggregate value is unlikely to be processed in vector register. 10188 return tryToVectorizeList(BuildVectorOpds, R); 10189 } 10190 10191 bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, 10192 BasicBlock *BB, BoUpSLP &R) { 10193 SmallVector<Value *, 16> BuildVectorInsts; 10194 SmallVector<Value *, 16> BuildVectorOpds; 10195 SmallVector<int> Mask; 10196 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) || 10197 (llvm::all_of( 10198 BuildVectorOpds, 10199 [](Value *V) { return isa<ExtractElementInst, UndefValue>(V); }) && 10200 isFixedVectorShuffle(BuildVectorOpds, Mask))) 10201 return false; 10202 10203 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n"); 10204 return tryToVectorizeList(BuildVectorInsts, R); 10205 } 10206 10207 template <typename T> 10208 static bool 10209 tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, 10210 function_ref<unsigned(T *)> Limit, 10211 function_ref<bool(T *, T *)> Comparator, 10212 function_ref<bool(T *, T *)> AreCompatible, 10213 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper, 10214 bool LimitForRegisterSize) { 10215 bool Changed = false; 10216 // Sort by type, parent, operands. 10217 stable_sort(Incoming, Comparator); 10218 10219 // Try to vectorize elements base on their type. 10220 SmallVector<T *> Candidates; 10221 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) { 10222 // Look for the next elements with the same type, parent and operand 10223 // kinds. 10224 auto *SameTypeIt = IncIt; 10225 while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt)) 10226 ++SameTypeIt; 10227 10228 // Try to vectorize them. 10229 unsigned NumElts = (SameTypeIt - IncIt); 10230 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes (" 10231 << NumElts << ")\n"); 10232 // The vectorization is a 3-state attempt: 10233 // 1. Try to vectorize instructions with the same/alternate opcodes with the 10234 // size of maximal register at first. 10235 // 2. Try to vectorize remaining instructions with the same type, if 10236 // possible. This may result in the better vectorization results rather than 10237 // if we try just to vectorize instructions with the same/alternate opcodes. 10238 // 3. Final attempt to try to vectorize all instructions with the 10239 // same/alternate ops only, this may result in some extra final 10240 // vectorization. 10241 if (NumElts > 1 && 10242 TryToVectorizeHelper(makeArrayRef(IncIt, NumElts), LimitForRegisterSize)) { 10243 // Success start over because instructions might have been changed. 
      Changed = true;
    } else if (NumElts < Limit(*IncIt) &&
               (Candidates.empty() ||
                Candidates.front()->getType() == (*IncIt)->getType())) {
      Candidates.append(IncIt, std::next(IncIt, NumElts));
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*LimitForRegisterSize=*/false)) {
        // Success: start over because instructions might have been changed.
        Changed = true;
      } else if (LimitForRegisterSize) {
        // Try to vectorize using small vectors.
        for (auto *It = Candidates.begin(), *End = Candidates.end();
             It != End;) {
          auto *SameTypeIt = It;
          while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
            ++SameTypeIt;
          unsigned NumElts = (SameTypeIt - It);
          if (NumElts > 1 && TryToVectorizeHelper(makeArrayRef(It, NumElts),
                                                  /*LimitForRegisterSize=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}

/// Compare two cmp instructions. If IsCompatibility is true, the function
/// returns true if the two cmps have the same or swapped predicates and the
/// most compatible corresponding operands. If IsCompatibility is false, the
/// function implements a strict weak ordering relation between two cmp
/// instructions, returning true if the first instruction is "less" than the
/// second, i.e. its predicate is less than the predicate of the second or the
/// operand IDs are less than the operand IDs of the second cmp instruction.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2,
                       function_ref<bool(Instruction *)> IsDeleted) {
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (IsDeleted(CI2) || !isValidElementType(CI2->getType()))
    return false;
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  CmpInst::Predicate Pred1 = CI1->getPredicate();
  CmpInst::Predicate Pred2 = CI2->getPredicate();
  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
  CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  bool LEPreds = Pred1 <= Pred2;
  bool GEPreds = Pred1 >= Pred2;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(LEPreds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(GEPreds ?
I : E - I - 1); 10315 if (Op1->getValueID() < Op2->getValueID()) 10316 return !IsCompatibility; 10317 if (Op1->getValueID() > Op2->getValueID()) 10318 return false; 10319 if (auto *I1 = dyn_cast<Instruction>(Op1)) 10320 if (auto *I2 = dyn_cast<Instruction>(Op2)) { 10321 if (I1->getParent() != I2->getParent()) 10322 return false; 10323 InstructionsState S = getSameOpcode({I1, I2}); 10324 if (S.getOpcode()) 10325 continue; 10326 return false; 10327 } 10328 } 10329 return IsCompatibility; 10330 } 10331 10332 bool SLPVectorizerPass::vectorizeSimpleInstructions( 10333 SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R, 10334 bool AtTerminator) { 10335 bool OpsChanged = false; 10336 SmallVector<Instruction *, 4> PostponedCmps; 10337 for (auto *I : reverse(Instructions)) { 10338 if (R.isDeleted(I)) 10339 continue; 10340 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) 10341 OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); 10342 else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) 10343 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R); 10344 else if (isa<CmpInst>(I)) 10345 PostponedCmps.push_back(I); 10346 } 10347 if (AtTerminator) { 10348 // Try to find reductions first. 10349 for (Instruction *I : PostponedCmps) { 10350 if (R.isDeleted(I)) 10351 continue; 10352 for (Value *Op : I->operands()) 10353 OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI); 10354 } 10355 // Try to vectorize operands as vector bundles. 10356 for (Instruction *I : PostponedCmps) { 10357 if (R.isDeleted(I)) 10358 continue; 10359 OpsChanged |= tryToVectorize(I, R); 10360 } 10361 // Try to vectorize list of compares. 10362 // Sort by type, compare predicate, etc. 10363 auto &&CompareSorter = [&R](Value *V, Value *V2) { 10364 return compareCmp<false>(V, V2, 10365 [&R](Instruction *I) { return R.isDeleted(I); }); 10366 }; 10367 10368 auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) { 10369 if (V1 == V2) 10370 return true; 10371 return compareCmp<true>(V1, V2, 10372 [&R](Instruction *I) { return R.isDeleted(I); }); 10373 }; 10374 auto Limit = [&R](Value *V) { 10375 unsigned EltSize = R.getVectorElementSize(V); 10376 return std::max(2U, R.getMaxVecRegSize() / EltSize); 10377 }; 10378 10379 SmallVector<Value *> Vals(PostponedCmps.begin(), PostponedCmps.end()); 10380 OpsChanged |= tryToVectorizeSequence<Value>( 10381 Vals, Limit, CompareSorter, AreCompatibleCompares, 10382 [this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) { 10383 // Exclude possible reductions from other blocks. 10384 bool ArePossiblyReducedInOtherBlock = 10385 any_of(Candidates, [](Value *V) { 10386 return any_of(V->users(), [V](User *U) { 10387 return isa<SelectInst>(U) && 10388 cast<SelectInst>(U)->getParent() != 10389 cast<Instruction>(V)->getParent(); 10390 }); 10391 }); 10392 if (ArePossiblyReducedInOtherBlock) 10393 return false; 10394 return tryToVectorizeList(Candidates, R, LimitForRegisterSize); 10395 }, 10396 /*LimitForRegisterSize=*/true); 10397 Instructions.clear(); 10398 } else { 10399 // Insert in reverse order since the PostponedCmps vector was filled in 10400 // reverse order. 
10401 Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend()); 10402 } 10403 return OpsChanged; 10404 } 10405 10406 bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { 10407 bool Changed = false; 10408 SmallVector<Value *, 4> Incoming; 10409 SmallPtrSet<Value *, 16> VisitedInstrs; 10410 // Maps phi nodes to the non-phi nodes found in the use tree for each phi 10411 // node. Allows better to identify the chains that can be vectorized in the 10412 // better way. 10413 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes; 10414 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) { 10415 assert(isValidElementType(V1->getType()) && 10416 isValidElementType(V2->getType()) && 10417 "Expected vectorizable types only."); 10418 // It is fine to compare type IDs here, since we expect only vectorizable 10419 // types, like ints, floats and pointers, we don't care about other type. 10420 if (V1->getType()->getTypeID() < V2->getType()->getTypeID()) 10421 return true; 10422 if (V1->getType()->getTypeID() > V2->getType()->getTypeID()) 10423 return false; 10424 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1]; 10425 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2]; 10426 if (Opcodes1.size() < Opcodes2.size()) 10427 return true; 10428 if (Opcodes1.size() > Opcodes2.size()) 10429 return false; 10430 Optional<bool> ConstOrder; 10431 for (int I = 0, E = Opcodes1.size(); I < E; ++I) { 10432 // Undefs are compatible with any other value. 10433 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) { 10434 if (!ConstOrder) 10435 ConstOrder = 10436 !isa<UndefValue>(Opcodes1[I]) && isa<UndefValue>(Opcodes2[I]); 10437 continue; 10438 } 10439 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I])) 10440 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) { 10441 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent()); 10442 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent()); 10443 if (!NodeI1) 10444 return NodeI2 != nullptr; 10445 if (!NodeI2) 10446 return false; 10447 assert((NodeI1 == NodeI2) == 10448 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) && 10449 "Different nodes should have different DFS numbers"); 10450 if (NodeI1 != NodeI2) 10451 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); 10452 InstructionsState S = getSameOpcode({I1, I2}); 10453 if (S.getOpcode()) 10454 continue; 10455 return I1->getOpcode() < I2->getOpcode(); 10456 } 10457 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) { 10458 if (!ConstOrder) 10459 ConstOrder = Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID(); 10460 continue; 10461 } 10462 if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID()) 10463 return true; 10464 if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID()) 10465 return false; 10466 } 10467 return ConstOrder && *ConstOrder; 10468 }; 10469 auto AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) { 10470 if (V1 == V2) 10471 return true; 10472 if (V1->getType() != V2->getType()) 10473 return false; 10474 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1]; 10475 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2]; 10476 if (Opcodes1.size() != Opcodes2.size()) 10477 return false; 10478 for (int I = 0, E = Opcodes1.size(); I < E; ++I) { 10479 // Undefs are compatible with any other value. 
10480 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) 10481 continue; 10482 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I])) 10483 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) { 10484 if (I1->getParent() != I2->getParent()) 10485 return false; 10486 InstructionsState S = getSameOpcode({I1, I2}); 10487 if (S.getOpcode()) 10488 continue; 10489 return false; 10490 } 10491 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) 10492 continue; 10493 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID()) 10494 return false; 10495 } 10496 return true; 10497 }; 10498 auto Limit = [&R](Value *V) { 10499 unsigned EltSize = R.getVectorElementSize(V); 10500 return std::max(2U, R.getMaxVecRegSize() / EltSize); 10501 }; 10502 10503 bool HaveVectorizedPhiNodes = false; 10504 do { 10505 // Collect the incoming values from the PHIs. 10506 Incoming.clear(); 10507 for (Instruction &I : *BB) { 10508 PHINode *P = dyn_cast<PHINode>(&I); 10509 if (!P) 10510 break; 10511 10512 // No need to analyze deleted, vectorized and non-vectorizable 10513 // instructions. 10514 if (!VisitedInstrs.count(P) && !R.isDeleted(P) && 10515 isValidElementType(P->getType())) 10516 Incoming.push_back(P); 10517 } 10518 10519 // Find the corresponding non-phi nodes for better matching when trying to 10520 // build the tree. 10521 for (Value *V : Incoming) { 10522 SmallVectorImpl<Value *> &Opcodes = 10523 PHIToOpcodes.try_emplace(V).first->getSecond(); 10524 if (!Opcodes.empty()) 10525 continue; 10526 SmallVector<Value *, 4> Nodes(1, V); 10527 SmallPtrSet<Value *, 4> Visited; 10528 while (!Nodes.empty()) { 10529 auto *PHI = cast<PHINode>(Nodes.pop_back_val()); 10530 if (!Visited.insert(PHI).second) 10531 continue; 10532 for (Value *V : PHI->incoming_values()) { 10533 if (auto *PHI1 = dyn_cast<PHINode>((V))) { 10534 Nodes.push_back(PHI1); 10535 continue; 10536 } 10537 Opcodes.emplace_back(V); 10538 } 10539 } 10540 } 10541 10542 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>( 10543 Incoming, Limit, PHICompare, AreCompatiblePHIs, 10544 [this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) { 10545 return tryToVectorizeList(Candidates, R, LimitForRegisterSize); 10546 }, 10547 /*LimitForRegisterSize=*/true); 10548 Changed |= HaveVectorizedPhiNodes; 10549 VisitedInstrs.insert(Incoming.begin(), Incoming.end()); 10550 } while (HaveVectorizedPhiNodes); 10551 10552 VisitedInstrs.clear(); 10553 10554 SmallVector<Instruction *, 8> PostProcessInstructions; 10555 SmallDenseSet<Instruction *, 4> KeyNodes; 10556 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 10557 // Skip instructions with scalable type. The num of elements is unknown at 10558 // compile-time for scalable type. 10559 if (isa<ScalableVectorType>(it->getType())) 10560 continue; 10561 10562 // Skip instructions marked for the deletion. 10563 if (R.isDeleted(&*it)) 10564 continue; 10565 // We may go through BB multiple times so skip the one we have checked. 10566 if (!VisitedInstrs.insert(&*it).second) { 10567 if (it->use_empty() && KeyNodes.contains(&*it) && 10568 vectorizeSimpleInstructions(PostProcessInstructions, BB, R, 10569 it->isTerminator())) { 10570 // We would like to start over since some instructions are deleted 10571 // and the iterator may become invalid value. 10572 Changed = true; 10573 it = BB->begin(); 10574 e = BB->end(); 10575 } 10576 continue; 10577 } 10578 10579 if (isa<DbgInfoIntrinsic>(it)) 10580 continue; 10581 10582 // Try to vectorize reductions that use PHINodes. 
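    // For instance (sketch), a two-operand reduction PHI such as
    //   %acc = phi float [ 0.0, %entry ], [ %acc.next, %bb ]
    // is handed to getReductionValue to locate %acc.next, and a horizontal
    // reduction rooted at that value is then attempted.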
    if (PHINode *P = dyn_cast<PHINode>(it)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
        if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
                                     TTI)) {
          Changed = true;
          it = BB->begin();
          e = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {
        // Skip if the incoming block is the current BB for now. Also, bypass
        // unreachable IR for efficiency and to avoid crashing.
        // TODO: Collect the skipped incoming values and try to vectorize them
        // after processing BB.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;

        Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I),
                                            P->getIncomingBlock(I), R, TTI);
      }
      continue;
    }

    // We ran into an instruction without users, such as a terminator, a store,
    // or a function call with an ignored return value. Ignore unused
    // instructions (based on the instruction type, except for CallInst and
    // InvokeInst).
    if (it->use_empty() && (it->getType()->isVoidTy() || isa<CallInst>(it) ||
                            isa<InvokeInst>(it))) {
      KeyNodes.insert(&*it);
      bool OpsChanged = false;
      if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(it)) {
        for (auto *V : it->operand_values()) {
          // Try to match and vectorize a horizontal reduction.
          OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
        }
      }
      // Start vectorization of post-process list of instructions from the
      // top-tree instructions to try to vectorize as many instructions as
      // possible.
      OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R,
                                                it->isTerminator());
      if (OpsChanged) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
        it = BB->begin();
        e = BB->end();
        continue;
      }
    }

    if (isa<InsertElementInst>(it) || isa<CmpInst>(it) ||
        isa<InsertValueInst>(it))
      PostProcessInstructions.push_back(&*it);
  }

  return Changed;
}

bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. If a vector register can't hold 1 element, we are done. We
    // are trying to vectorize the index computations, so the maximum number of
    // elements is based on the size of the index expression, rather than the
    // size of the GEP itself (the target's pointer size).
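    // For example (illustrative numbers, not target-specific): with a 128-bit
    // maximum vector register and i64 index expressions, MaxElts is
    // 128 / 64 = 2, so the GEP list below is processed in chunks of at most
    // two candidates at a time.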
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order. If the index computations
      // are vectorizable and begin with loads, we want to minimize the chance
      // of having to reorder them later.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized after we
      // initially collected them. If so, they are marked as deleted, so remove
      // them from the set of candidates.
      Candidates.remove_if(
          [&R](Value *I) { return R.isDeleted(cast<Instruction>(I)); });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other. We also ensure all candidate getelementptr
      // indices are unique.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        auto *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          auto *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle. We
      // ensured the indices met these constraints when we originally collected
      // the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices. We are currently only interested in
      // gather-like cases of the form:
      //
      //   ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      //
      // where the loads of "a", the loads of "b", and the subtractions can be
      // performed in parallel. It's likely that detecting this pattern in a
      // bottom-up phase will be simpler and less costly than building a
      // full-blown top-down phase beginning at the consecutive loads.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}

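// Illustrative sketch (assumed IR, not from an existing test) of what the
// store-chain vectorization below is after: four consecutive i32 stores of
// independently computed values,
//   store i32 %v0, i32* %p0
//   store i32 %v1, i32* %p1   ; %p1 is one i32 past %p0
//   store i32 %v2, i32* %p2   ; %p2 is two i32 past %p0
//   store i32 %v3, i32* %p3   ; %p3 is three i32 past %p0
// can seed a vectorizable tree rooted at a single <4 x i32> store, provided
// the %v* values themselves are compatible (see AreCompatibleStores below).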
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and value operands. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    // UndefValues are compatible with all other values.
    if (isa<UndefValue>(V->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return false;
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        InstructionsState S = getSameOpcode({I1, I2});
        if (S.getOpcode())
          return false;
        return I1->getOpcode() < I2->getOpcode();
      }
    if (isa<Constant>(V->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return false;
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto &&AreCompatibleStores = [](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        InstructionsState S = getSameOpcode({I1, I2});
        return S.getOpcode() > 0;
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };
  auto Limit = [&R, this](StoreInst *SI) {
    unsigned EltSize = DL->getTypeSizeInBits(SI->getValueOperand()->getType());
    return R.getMinVF(EltSize);
  };

  // Attempt to sort and vectorize each of the store-groups.
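  // For example (illustrative, assumed IR): the first two stores below are
  // compatible because both value operands are add instructions in the same
  // block, while the third stores a mul and therefore lands in a different
  // group:
  //   store i32 %add0, i32* %p0
  //   store i32 %add1, i32* %p1
  //   store i32 %mul0, i32* %p2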
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
      continue;

    Changed |= tryToVectorizeSequence<StoreInst>(
        Pair.second, Limit, StoreSorter, AreCompatibleStores,
        [this, &R](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R);
        },
        /*LimitForRegisterSize=*/false);
  }
  return Changed;
}

char SLPVectorizer::ID = 0;

static const char lv_name[] = "SLP Vectorizer";

INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)

Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }
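
// Usage sketch (not part of the pass logic; command spelling taken from the
// SV_NAME registration above): the pass can be exercised in isolation with
// opt, e.g.
//   opt -passes=slp-vectorizer -S input.ll -o output.ll
// or added to a legacy pass manager pipeline via createSLPVectorizerPass().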