1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file contains implementations for different VPlan recipes.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #include "VPlan.h"
15 #include "llvm/ADT/STLExtras.h"
16 #include "llvm/ADT/SmallVector.h"
17 #include "llvm/ADT/Twine.h"
18 #include "llvm/Analysis/IVDescriptors.h"
19 #include "llvm/IR/BasicBlock.h"
20 #include "llvm/IR/IRBuilder.h"
21 #include "llvm/IR/Instruction.h"
22 #include "llvm/IR/Instructions.h"
23 #include "llvm/IR/Type.h"
24 #include "llvm/IR/Value.h"
25 #include "llvm/Support/Casting.h"
26 #include "llvm/Support/CommandLine.h"
27 #include "llvm/Support/Debug.h"
28 #include "llvm/Support/raw_ostream.h"
29 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
30 #include <cassert>
31 
32 using namespace llvm;
33 
34 extern cl::opt<bool> EnableVPlanNativePath;
35 
36 #define LV_NAME "loop-vectorize"
37 #define DEBUG_TYPE LV_NAME
38 
/// Returns true if this recipe may write to memory. Conservative: any recipe
/// kind not explicitly listed is assumed to write.
bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPWidenMemoryInstructionSC: {
    // A widened memory recipe writes iff it represents a store.
    return cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
  }
  case VPReplicateSC:
  case VPWidenCallSC:
    // Defer to the underlying scalar instruction's memory behavior.
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPBranchOnMaskSC:
    return false;
  // NOTE(review): unlike mayHaveSideEffects(), VPWidenPointerInductionSC and
  // VPScalarIVStepsSC are not listed here and fall through to the
  // conservative default (true) — confirm this is intended.
  case VPWidenIntOrFpInductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenPHISC:
  case VPBlendSC:
  case VPWidenSC:
  case VPWidenGEPSC:
  case VPReductionSC:
  case VPWidenSelectSC: {
    // These recipes never write to memory; in asserting builds, double-check
    // that the underlying IR instruction (if any) agrees.
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}
69 
/// Returns true if this recipe may read from memory. Conservative: any recipe
/// kind not explicitly listed is assumed to read.
bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenMemoryInstructionSC: {
    // A widened memory recipe reads iff it is not a store (i.e. a load).
    return !cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
  }
  case VPReplicateSC:
  case VPWidenCallSC:
    // Defer to the underlying scalar instruction's memory behavior.
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPBranchOnMaskSC:
    return false;
  case VPWidenIntOrFpInductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenPHISC:
  case VPBlendSC:
  case VPWidenSC:
  case VPWidenGEPSC:
  case VPReductionSC:
  case VPWidenSelectSC: {
    // These recipes never read from memory; in asserting builds, double-check
    // that the underlying IR instruction (if any) agrees.
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}
100 
/// Returns true if this recipe may have side-effects. Conservative: any recipe
/// kind not explicitly listed is assumed to have side-effects.
bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPWidenIntOrFpInductionSC:
  case VPWidenPointerInductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenPHISC:
  case VPBlendSC:
  case VPWidenSC:
  case VPWidenGEPSC:
  case VPReductionSC:
  case VPWidenSelectSC:
  case VPScalarIVStepsSC: {
    // These recipes are side-effect free; in asserting builds, double-check
    // that the underlying IR instruction (if any) agrees.
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPReplicateSC: {
    // A replicated instruction has exactly the side-effects of the scalar
    // instruction it replicates.
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}
128 
129 void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
130   auto Lane = VPLane::getLastLaneForVF(State.VF);
131   VPValue *ExitValue = getOperand(0);
132   if (Plan.isUniformAfterVectorization(ExitValue))
133     Lane = VPLane::getFirstLane();
134   Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
135                    State.Builder.GetInsertBlock());
136 }
137 
138 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
139   assert(!Parent && "Recipe already in some VPBasicBlock");
140   assert(InsertPos->getParent() &&
141          "Insertion position not in any VPBasicBlock");
142   Parent = InsertPos->getParent();
143   Parent->getRecipeList().insert(InsertPos->getIterator(), this);
144 }
145 
146 void VPRecipeBase::insertBefore(VPBasicBlock &BB,
147                                 iplist<VPRecipeBase>::iterator I) {
148   assert(!Parent && "Recipe already in some VPBasicBlock");
149   assert(I == BB.end() || I->getParent() == &BB);
150   Parent = &BB;
151   BB.getRecipeList().insert(I, this);
152 }
153 
154 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
155   assert(!Parent && "Recipe already in some VPBasicBlock");
156   assert(InsertPos->getParent() &&
157          "Insertion position not in any VPBasicBlock");
158   Parent = InsertPos->getParent();
159   Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this);
160 }
161 
162 void VPRecipeBase::removeFromParent() {
163   assert(getParent() && "Recipe not in any VPBasicBlock");
164   getParent()->getRecipeList().remove(getIterator());
165   Parent = nullptr;
166 }
167 
168 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
169   assert(getParent() && "Recipe not in any VPBasicBlock");
170   return getParent()->getRecipeList().erase(getIterator());
171 }
172 
// Unlink this recipe from its current block and re-link it directly after
// InsertPos (possibly in a different block).
void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}
177 
// Unlink this recipe from its current block and re-link it into BB at
// position I (possibly a different block).
void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}
183 
// Emit the IR for this VPInstruction for unroll part \p Part at the current
// builder insert point, recording the result (if any) in State.
void VPInstruction::generateInstruction(VPTransformState &State,
                                        unsigned Part) {
  IRBuilderBase &Builder = State.Builder;
  Builder.SetCurrentDebugLocation(DL);

  // All LLVM binary opcodes are widened uniformly.
  if (Instruction::isBinaryOp(getOpcode())) {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B);
    State.set(this, V, Part);
    return;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    Value *V = Builder.CreateNot(A);
    State.set(this, V, Part);
    break;
  }
  case VPInstruction::ICmpULE: {
    Value *IV = State.get(getOperand(0), Part);
    Value *TC = State.get(getOperand(1), Part);
    Value *V = Builder.CreateICmpULE(IV, TC);
    State.set(this, V, Part);
    break;
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    Value *V = Builder.CreateSelect(Cond, Op1, Op2);
    State.set(this, V, Part);
    break;
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), Part);

    // Emit @llvm.get.active.lane.mask(VIVElem0, ScalarTC), producing a
    // <VF x i1> predicate.
    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    Instruction *Call = Builder.CreateIntrinsic(
        Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()},
        {VIVElem0, ScalarTC}, nullptr, "active.lane.mask");
    State.set(this, Call, Part);
    break;
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy()) {
      // Scalar (VF == 1) case: no splice needed, just forward the value.
      State.set(this, PartMinus1, Part);
    } else {
      Value *V2 = State.get(getOperand(1), Part);
      State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1), Part);
    }
    break;
  }
  case VPInstruction::CanonicalIVIncrement:
  case VPInstruction::CanonicalIVIncrementNUW: {
    // The increment is computed once for part 0 and re-used for all other
    // parts, since the canonical IV is scalar and shared across parts.
    Value *Next = nullptr;
    if (Part == 0) {
      bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW;
      auto *Phi = State.get(getOperand(0), 0);
      // The loop step is equal to the vectorization factor (num of SIMD
      // elements) times the unroll factor (num of SIMD instructions).
      Value *Step =
          createStepForVF(Builder, Phi->getType(), State.VF, State.UF);
      Next = Builder.CreateAdd(Phi, Step, "index.next", IsNUW, false);
    } else {
      Next = State.get(this, 0);
    }

    State.set(this, Next, Part);
    break;
  }
  case VPInstruction::BranchOnCond: {
    // A single branch is emitted regardless of UF.
    if (Part != 0)
      break;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);

    if (getParent()->isExiting())
      CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);

    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    break;
  }
  case VPInstruction::BranchOnCount: {
    // A single latch branch is emitted regardless of UF.
    if (Part != 0)
      break;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part);
    Value *TC = State.get(getOperand(1), Part);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to the
    // forward destination (the exit/middle block) later when it is created.
    // Note that CreateCondBr expects a valid BB as first argument, so we need
    // to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    break;
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}
324 
325 void VPInstruction::execute(VPTransformState &State) {
326   assert(!State.Instance && "VPInstruction executing an Instance");
327   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
328   State.Builder.setFastMathFlags(FMF);
329   for (unsigned Part = 0; Part < State.UF; ++Part)
330     generateInstruction(State, Part);
331 }
332 
333 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
334 void VPInstruction::dump() const {
335   VPSlotTracker SlotTracker(getParent()->getPlan());
336   print(dbgs(), "", SlotTracker);
337 }
338 
// Print a textual form of this VPInstruction: "EMIT [<result> =] <opcode>
// <operands> [, !dbg <loc>]".
void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  // VPlan-specific opcodes get dedicated mnemonics; anything else falls back
  // to the LLVM IR opcode name.
  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::ICmpULE:
    O << "icmp ule";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::CanonicalIVIncrement:
    O << "VF * UF + ";
    break;
  case VPInstruction::CanonicalIVIncrementNUW:
    O << "VF * UF +(nuw) ";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count ";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  O << FMF;

  for (const VPValue *Operand : operands()) {
    O << " ";
    Operand->printAsOperand(O, SlotTracker);
  }

  // Append the debug location, if one is attached.
  if (DL) {
    O << ", !dbg ";
    DL.print(O);
  }
}
395 #endif
396 
397 void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) {
398   // Make sure the VPInstruction is a floating-point operation.
399   assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
400           Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
401           Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
402           Opcode == Instruction::FCmp) &&
403          "this op can't take fast-math flags");
404   FMF = FMFNew;
405 }
406 
407 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print this recipe as "WIDEN-CALL [<result> = |void ]call @<fn>(<ops>)".
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  auto *CI = cast<CallInst>(getUnderlyingInstr());
  // Void calls have no result operand to print.
  if (CI->getType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CI->getCalledFunction()->getName() << "(";
  printOperands(O, SlotTracker);
  O << ")";
}
424 
// Print this recipe as "WIDEN-SELECT <result> = select <cond>, <true>,
// <false>", noting an invariant condition when applicable.
void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (InvariantCond ? " (condition is loop invariant)" : "");
}
437 #endif
438 
439 void VPWidenSelectRecipe::execute(VPTransformState &State) {
440   auto &I = *cast<SelectInst>(getUnderlyingInstr());
441   State.setDebugLocFromInst(&I);
442 
443   // The condition can be loop invariant but still defined inside the
444   // loop. This means that we can't just use the original 'cond' value.
445   // We have to take the 'vectorized' value and pick the first lane.
446   // Instcombine will make this a no-op.
447   auto *InvarCond =
448       InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
449 
450   for (unsigned Part = 0; Part < State.UF; ++Part) {
451     Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
452     Value *Op0 = State.get(getOperand(1), Part);
453     Value *Op1 = State.get(getOperand(2), Part);
454     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
455     State.set(this, Sel, Part);
456     State.addMetadata(Sel, &I);
457   }
458 }
459 
// Widen the underlying scalar instruction into one vector instruction per
// unroll part, dispatching on the scalar opcode.
void VPWidenRecipe::execute(VPTransformState &State) {
  auto &I = *cast<Instruction>(getUnderlyingValue());
  auto &Builder = State.Builder;
  switch (I.getOpcode()) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    State.setDebugLocFromInst(&I);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // Gather the widened operands for this part.
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V)) {
        VecOp->copyIRFlags(&I);

        // If the instruction is vectorized and was in a basic block that needed
        // predication, we can't propagate poison-generating flags (nuw/nsw,
        // exact, etc.). The control flow has been linearized and the
        // instruction is no longer guarded by the predicate, which could make
        // the flag properties to no longer hold.
        if (State.MayGeneratePoisonRecipes.contains(this))
          VecOp->dropPoisonGeneratingFlags();
      }

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, &I);
    }

    break;
  }
  case Instruction::Freeze: {
    // Widen freeze by freezing the widened single operand per part.
    State.setDebugLocFromInst(&I);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = (I.getOpcode() == Instruction::FCmp);
    auto *Cmp = cast<CmpInst>(&I);
    State.setDebugLocFromInst(Cmp);
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        Builder.setFastMathFlags(Cmp->getFastMathFlags());
        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, &I);
    }

    break;
  }

  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto *CI = cast<CastInst>(&I);
    State.setDebugLocFromInst(CI);

    /// Vectorize casts.
    Type *DestTy = (State.VF.isScalar())
                       ? CI->getType()
                       : VectorType::get(CI->getType(), State.VF);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
      State.set(this, Cast, Part);
      State.addMetadata(Cast, &I);
    }
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.
}
588 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print this recipe as "WIDEN <result> = <opcode> <operands>".
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << getUnderlyingInstr()->getOpcodeName() << " ";
  printOperands(O, SlotTracker);
}
596 
// Print this recipe as "WIDEN-INDUCTION <iv>, <step>"; a truncated induction
// additionally prints the trunc result, using the escaped multi-line format
// expected by DOT/graph output.
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INDUCTION";
  if (getTruncInst()) {
    O << "\\l\"";
    O << " +\n" << Indent << "\"  " << VPlanIngredient(IV) << "\\l\"";
    O << " +\n" << Indent << "\"  ";
    getVPValue(0)->printAsOperand(O, SlotTracker);
  } else
    O << " " << VPlanIngredient(IV);

  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
611 #endif
612 
613 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
614   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
615   auto *StepC = dyn_cast<SCEVConstant>(getInductionDescriptor().getStep());
616   return StartC && StartC->isZero() && StepC && StepC->isOne();
617 }
618 
619 VPCanonicalIVPHIRecipe *VPScalarIVStepsRecipe::getCanonicalIV() const {
620   return cast<VPCanonicalIVPHIRecipe>(getOperand(0));
621 }
622 
623 bool VPScalarIVStepsRecipe::isCanonical() const {
624   auto *CanIV = getCanonicalIV();
625   // The start value of the steps-recipe must match the start value of the
626   // canonical induction and it must step by 1.
627   if (CanIV->getStartValue() != getStartValue())
628     return false;
629   auto *StepVPV = getStepValue();
630   if (StepVPV->getDef())
631     return false;
632   auto *StepC = dyn_cast_or_null<ConstantInt>(StepVPV->getLiveInIRValue());
633   return StepC && StepC->isOne();
634 }
635 
636 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
637 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
638                                   VPSlotTracker &SlotTracker) const {
639   O << Indent;
640   printAsOperand(O, SlotTracker);
641   O << Indent << "= SCALAR-STEPS ";
642   printOperands(O, SlotTracker);
643 }
644 #endif
645 
// Widen the underlying GEP into a (possibly vector) GEP per unroll part,
// keeping loop-invariant operands scalar where possible.
void VPWidenGEPRecipe::execute(VPTransformState &State) {
  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    auto *Clone = State.Builder.Insert(GEP->clone());
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
      State.set(this, EntryPart, Part);
      State.addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = IsPtrLoopInvariant
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (IsIndexLoopInvariant[I - 1])
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // If the GEP instruction is vectorized and was in a basic block that
      // needed predication, we can't propagate the poison-generating 'inbounds'
      // flag. The control flow has been linearized and the GEP is no longer
      // guarded by the predicate, which could make the 'inbounds' properties to
      // no longer hold.
      bool IsInBounds =
          GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
                                             Indices, "", IsInBounds);
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.addMetadata(NewGEP, GEP);
    }
  }
}
718 
719 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print this recipe as "WIDEN-GEP Inv|Var[Inv|Var]... <result> =
// getelementptr <operands>", annotating the invariance of the pointer and
// each index.
void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-GEP ";
  O << (IsPtrLoopInvariant ? "Inv" : "Var");
  size_t IndicesNumber = IsIndexLoopInvariant.size();
  for (size_t I = 0; I < IndicesNumber; ++I)
    O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]";

  O << " ";
  printAsOperand(O, SlotTracker);
  O << " = getelementptr ";
  printOperands(O, SlotTracker);
}
733 
// Print this recipe as "BLEND <phi> = <value>/<mask> ...", or just the single
// incoming value when there is no mask to blend with.
void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "BLEND ";
  Phi->printAsOperand(O, false);
  O << " =";
  if (getNumIncomingValues() == 1) {
    // Not a User of any mask: not really blending, this is a
    // single-predecessor phi.
    O << " ";
    getIncomingValue(0)->printAsOperand(O, SlotTracker);
  } else {
    // Print each incoming value together with its controlling mask.
    for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
      O << " ";
      getIncomingValue(I)->printAsOperand(O, SlotTracker);
      O << "/";
      getMask(I)->printAsOperand(O, SlotTracker);
    }
  }
}
753 
// Print this recipe as "REDUCE <result> = <chain> + reduce.<op> (<vec>[,
// <cond>])", including fast-math flags for FP reductions and a note when the
// final value is stored to an invariant address outside the loop.
void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  if (isa<FPMathOperator>(getUnderlyingInstr()))
    O << getUnderlyingInstr()->getFastMathFlags();
  O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  if (getCondOp()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
  if (RdxDesc->IntermediateStore)
    O << " (with final reduction value stored in invariant address sank "
         "outside of loop)";
}
774 
// Print this recipe as "CLONE" (uniform) or "REPLICATE", followed by the
// replicated call or instruction and a "(S->V)" marker when the scalar
// results are additionally packed into a vector.
void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");

  if (!getUnderlyingInstr()->getType()->isVoidTy()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }
  if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
    // The last operand is the callee; print only the argument operands.
    O << "call @" << CB->getCalledFunction()->getName() << "(";
    interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
                    O, [&O, &SlotTracker](VPValue *Op) {
                      Op->printAsOperand(O, SlotTracker);
                    });
    O << ")";
  } else {
    O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " ";
    printOperands(O, SlotTracker);
  }

  if (AlsoPack)
    O << " (S->V)";
}
798 
// Print this recipe as "PHI-PREDICATED-INSTRUCTION <result> = <operands>".
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}
806 
// Print this recipe as "WIDEN [<result> = ]load|store <operands>"; stores
// produce no result operand.
void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";

  if (!isStore()) {
    getVPSingleValue()->printAsOperand(O, SlotTracker);
    O << " = ";
  }
  O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";

  printOperands(O, SlotTracker);
}
819 #endif
820 
821 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
822   Value *Start = getStartValue()->getLiveInIRValue();
823   PHINode *EntryPart = PHINode::Create(
824       Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt());
825 
826   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
827   EntryPart->addIncoming(Start, VectorPH);
828   EntryPart->setDebugLoc(DL);
829   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
830     State.set(this, EntryPart, Part);
831 }
832 
833 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print this recipe as "EMIT <result> = CANONICAL-INDUCTION".
void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION";
}
840 #endif
841 
842 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) {
843   bool IsUniform = vputils::onlyFirstLaneUsed(this);
844   return all_of(users(),
845                 [&](const VPUser *U) { return U->usesScalars(this); }) &&
846          (IsUniform || !VF.isScalable());
847 }
848 
849 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
850 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
851                                           VPSlotTracker &SlotTracker) const {
852   O << Indent << "EMIT ";
853   printAsOperand(O, SlotTracker);
854   O << " = WIDEN-POINTER-INDUCTION ";
855   getStartValue()->printAsOperand(O, SlotTracker);
856   O << ", " << *IndDesc.getStep();
857 }
858 #endif
859 
// Expand the wrapped SCEV expression to concrete IR at the current builder
// insert point. Runs once per vectorized loop, never per lane.
void VPExpandSCEVRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "cannot be used in per-lane");
  const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
  SCEVExpander Exp(SE, DL, "induction");

  Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
                                 &*State.Builder.GetInsertPoint());

  // The expanded value is the same for every unroll part, so register the
  // single result under each part.
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, Res, Part);
}
871 
872 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
873 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
874                                VPSlotTracker &SlotTracker) const {
875   O << Indent << "EMIT ";
876   getVPSingleValue()->printAsOperand(O, SlotTracker);
877   O << " = EXPAND SCEV " << *Expr;
878 }
879 #endif
880 
// Widen the scalar canonical IV into one vector IV per unroll part: each
// part's value is splat(IV) + step, where the step accounts for the part
// offset and, for vector VFs, the per-lane offsets <0, 1, ..., VF-1>.
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
  Value *CanonicalIV = State.get(getOperand(0), 0);
  Type *STy = CanonicalIV->getType();
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  ElementCount VF = State.VF;
  // Splat the scalar IV once; it is shared by all parts. For VF == 1 the
  // scalar value is used directly.
  Value *VStart = VF.isScalar()
                      ? CanonicalIV
                      : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    // Scalar step for this part (handles fixed and scalable VFs).
    Value *VStep = createStepForVF(Builder, STy, VF, Part);
    if (VF.isVector()) {
      VStep = Builder.CreateVectorSplat(VF, VStep);
      // Add the lane offsets <0, 1, ..., VF-1> on top of the splatted step.
      VStep =
          Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
    }
    Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
    State.set(this, CanonicalVectorIV, Part);
  }
}
900 
901 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
902 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
903                                      VPSlotTracker &SlotTracker) const {
904   O << Indent << "EMIT ";
905   printAsOperand(O, SlotTracker);
906   O << " = WIDEN-CANONICAL-INDUCTION ";
907   printOperands(O, SlotTracker);
908 }
909 #endif
910 
// Create the header phi for a first-order recurrence. For vector VFs the
// start value is inserted into the last lane of a poison vector in the
// preheader before feeding the phi.
void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    // Emit the insertelement in the preheader, restoring the previous insert
    // point afterwards via the guard.
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    // Last lane index is RuntimeVF - 1, computed at runtime so scalable
    // vectors are handled too.
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *EntryPart = PHINode::Create(
      VecTy, 2, "vector.recur", &*State.CFG.PrevBB->getFirstInsertionPt());
  EntryPart->addIncoming(VectorInit, VectorPH);
  // Only part 0 is registered; NOTE(review): later parts appear to be filled
  // in elsewhere — confirm against the recurrence-fixup code.
  State.set(this, EntryPart, 0);
}
938 
939 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
940 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
941                                             VPSlotTracker &SlotTracker) const {
942   O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
943   printAsOperand(O, SlotTracker);
944   O << " = phi ";
945   printOperands(O, SlotTracker);
946 }
947 #endif
948 
// Create the header phi(s) for a reduction and wire up their preheader
// incoming values: the start value for part 0 and the reduction identity for
// the remaining parts.
void VPReductionPHIRecipe::execute(VPTransformState &State) {
  PHINode *PN = cast<PHINode>(getUnderlyingValue());
  auto &Builder = State.Builder;

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy =
      ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  // Ordered reductions use a single phi shared across all unroll parts.
  unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart =
        PHINode::Create(VecTy, 2, "vec.phi", &*HeaderBB->getFirstInsertionPt());
    State.set(this, EntryPart, Part);
  }

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) {
    // MinMax reduction have the start value as their identify.
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      // Splat the start value in the preheader; it serves as both start and
      // identity for these kinds.
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden =
          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
    }
  } else {
    // All other kinds have a neutral identity element (e.g. 0 for add,
    // 1 for mul) provided by the recurrence descriptor.
    Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                         RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      // Vector start value: identity in all lanes except lane 0, which holds
      // the actual start value; built in the preheader.
      Iden = Builder.CreateVectorSplat(State.VF, Iden);
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      Constant *Zero = Builder.getInt32(0);
      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
    }
  }

  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart = State.get(this, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? StartV : Iden;
    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
  }
}
1012 
1013 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1014 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1015                                  VPSlotTracker &SlotTracker) const {
1016   O << Indent << "WIDEN-REDUCTION-PHI ";
1017 
1018   printAsOperand(O, SlotTracker);
1019   O << " = phi ";
1020   printOperands(O, SlotTracker);
1021 }
1022 #endif
1023 
// Widen a non-induction phi in the VPlan-native path: create an operand-less
// vector phi whose incoming values are filled in at the end of code
// generation. Its type is derived from the incoming value that corresponds
// to the preheader (for header phis) or operand 0 otherwise.
void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

  // Currently we enter here in the VPlan-native path for non-induction
  // PHIs where all control flow is uniform. We simply widen these PHIs.
  // Create a vector phi with no operands - the vector phi operands will be
  // set at the end of vector code generation.
  VPBasicBlock *Parent = getParent();
  VPRegionBlock *LoopRegion = Parent->getEnclosingLoopRegion();
  unsigned StartIdx = 0;
  // For phis in header blocks of loop regions, use the index of the value
  // coming from the preheader.
  if (LoopRegion->getEntryBasicBlock() == Parent) {
    for (unsigned I = 0; I < getNumOperands(); ++I) {
      // The preheader is the region's single predecessor; match incoming
      // blocks against its exiting block to find the preheader operand.
      if (getIncomingBlock(I) ==
          LoopRegion->getSinglePredecessor()->getExitingBasicBlock())
        StartIdx = I;
    }
  }
  Value *Op0 = State.get(getOperand(StartIdx), 0);
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  State.set(this, VecPhi, 0);
}
1049 
1050 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1051 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1052                              VPSlotTracker &SlotTracker) const {
1053   O << Indent << "WIDEN-PHI ";
1054 
1055   auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
1056   // Unless all incoming values are modeled in VPlan  print the original PHI
1057   // directly.
1058   // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
1059   // values as VPValues.
1060   if (getNumOperands() != OriginalPhi->getNumOperands()) {
1061     O << VPlanIngredient(OriginalPhi);
1062     return;
1063   }
1064 
1065   printAsOperand(O, SlotTracker);
1066   O << " = phi ";
1067   printOperands(O, SlotTracker);
1068 }
1069 #endif
1070