1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file contains implementations for different VPlan recipes.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #include "VPlan.h"
15 #include "llvm/ADT/STLExtras.h"
16 #include "llvm/ADT/SmallVector.h"
17 #include "llvm/ADT/Twine.h"
18 #include "llvm/Analysis/IVDescriptors.h"
19 #include "llvm/IR/BasicBlock.h"
20 #include "llvm/IR/IRBuilder.h"
21 #include "llvm/IR/Instruction.h"
22 #include "llvm/IR/Instructions.h"
23 #include "llvm/IR/Type.h"
24 #include "llvm/IR/Value.h"
25 #include "llvm/Support/Casting.h"
26 #include "llvm/Support/CommandLine.h"
27 #include "llvm/Support/Debug.h"
28 #include "llvm/Support/raw_ostream.h"
29 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
30 #include <cassert>
31 
32 using namespace llvm;
33 
34 extern cl::opt<bool> EnableVPlanNativePath;
35 
36 #define LV_NAME "loop-vectorize"
37 #define DEBUG_TYPE LV_NAME
38 
39 bool VPRecipeBase::mayWriteToMemory() const {
40   switch (getVPDefID()) {
41   case VPWidenMemoryInstructionSC: {
42     return cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
43   }
44   case VPReplicateSC:
45   case VPWidenCallSC:
46     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
47         ->mayWriteToMemory();
48   case VPBranchOnMaskSC:
49     return false;
50   case VPWidenIntOrFpInductionSC:
51   case VPWidenCanonicalIVSC:
52   case VPWidenPHISC:
53   case VPBlendSC:
54   case VPWidenSC:
55   case VPWidenGEPSC:
56   case VPReductionSC:
57   case VPWidenSelectSC: {
58     const Instruction *I =
59         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
60     (void)I;
61     assert((!I || !I->mayWriteToMemory()) &&
62            "underlying instruction may write to memory");
63     return false;
64   }
65   default:
66     return true;
67   }
68 }
69 
70 bool VPRecipeBase::mayReadFromMemory() const {
71   switch (getVPDefID()) {
72   case VPWidenMemoryInstructionSC: {
73     return !cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
74   }
75   case VPReplicateSC:
76   case VPWidenCallSC:
77     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
78         ->mayReadFromMemory();
79   case VPBranchOnMaskSC:
80     return false;
81   case VPWidenIntOrFpInductionSC:
82   case VPWidenCanonicalIVSC:
83   case VPWidenPHISC:
84   case VPBlendSC:
85   case VPWidenSC:
86   case VPWidenGEPSC:
87   case VPReductionSC:
88   case VPWidenSelectSC: {
89     const Instruction *I =
90         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
91     (void)I;
92     assert((!I || !I->mayReadFromMemory()) &&
93            "underlying instruction may read from memory");
94     return false;
95   }
96   default:
97     return true;
98   }
99 }
100 
101 bool VPRecipeBase::mayHaveSideEffects() const {
102   switch (getVPDefID()) {
103   case VPWidenIntOrFpInductionSC:
104   case VPWidenPointerInductionSC:
105   case VPWidenCanonicalIVSC:
106   case VPWidenPHISC:
107   case VPBlendSC:
108   case VPWidenSC:
109   case VPWidenGEPSC:
110   case VPReductionSC:
111   case VPWidenSelectSC:
112   case VPScalarIVStepsSC: {
113     const Instruction *I =
114         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
115     (void)I;
116     assert((!I || !I->mayHaveSideEffects()) &&
117            "underlying instruction has side-effects");
118     return false;
119   }
120   case VPReplicateSC: {
121     auto *R = cast<VPReplicateRecipe>(this);
122     return R->getUnderlyingInstr()->mayHaveSideEffects();
123   }
124   default:
125     return true;
126   }
127 }
128 
129 void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
130   auto Lane = VPLane::getLastLaneForVF(State.VF);
131   VPValue *ExitValue = getOperand(0);
132   if (Plan.isUniformAfterVectorization(ExitValue))
133     Lane = VPLane::getFirstLane();
134   Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
135                    State.Builder.GetInsertBlock());
136 }
137 
/// Insert this (currently unparented) recipe into InsertPos's block,
/// immediately before InsertPos.
void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  Parent = InsertPos->getParent();
  Parent->getRecipeList().insert(InsertPos->getIterator(), this);
}
145 
/// Insert this (currently unparented) recipe into block BB at iterator
/// position I. I may be BB.end().
void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  Parent = &BB;
  BB.getRecipeList().insert(I, this);
}
153 
/// Insert this (currently unparented) recipe into InsertPos's block,
/// immediately after InsertPos.
void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  Parent = InsertPos->getParent();
  Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this);
}
161 
/// Unlink this recipe from its parent block without deleting it; the recipe
/// can subsequently be re-inserted elsewhere.
void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}
167 
/// Unlink this recipe from its parent block and delete it (ilist erase owns
/// and destroys the node). Returns an iterator to the next recipe.
iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}
172 
/// Move this recipe (possibly from another block) to directly after
/// InsertPos. Implemented as unlink-then-insert; order matters since
/// insertAfter asserts the recipe is unparented.
void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}
177 
/// Move this recipe (possibly from another block) to iterator position I in
/// block BB. Implemented as unlink-then-insert; order matters since
/// insertBefore asserts the recipe is unparented.
void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}
183 
/// Emit the IR for this VPInstruction for a single unroll part. The emitted
/// value (if any) is recorded in State so later recipes can look it up.
void VPInstruction::generateInstruction(VPTransformState &State,
                                        unsigned Part) {
  IRBuilderBase &Builder = State.Builder;
  Builder.SetCurrentDebugLocation(DL);

  // Generic LLVM binary operators: widen by emitting the same binop on the
  // per-part operand values.
  if (Instruction::isBinaryOp(getOpcode())) {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    Value *V =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    State.set(this, V, Part);
    return;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    // Bitwise negation of the single operand.
    Value *A = State.get(getOperand(0), Part);
    Value *V = Builder.CreateNot(A, Name);
    State.set(this, V, Part);
    break;
  }
  case VPInstruction::ICmpULE: {
    // Unsigned <= compare; operand 0 is the IV, operand 1 the trip count.
    Value *IV = State.get(getOperand(0), Part);
    Value *TC = State.get(getOperand(1), Part);
    Value *V = Builder.CreateICmpULE(IV, TC, Name);
    State.set(this, V, Part);
    break;
  }
  case Instruction::Select: {
    // Three-operand select: (cond, true-value, false-value).
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    Value *V = Builder.CreateSelect(Cond, Op1, Op2, Name);
    State.set(this, V, Part);
    break;
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), Part);

    // Emit llvm.get.active.lane.mask producing a <VF x i1> predicate.
    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    Instruction *Call = Builder.CreateIntrinsic(
        Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()},
        {VIVElem0, ScalarTC}, nullptr, Name);
    State.set(this, Call, Part);
    break;
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy()) {
      // Scalar VF: no splice needed, the previous value is used directly.
      State.set(this, PartMinus1, Part);
    } else {
      Value *V2 = State.get(getOperand(1), Part);
      State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1, Name),
                Part);
    }
    break;
  }
  case VPInstruction::CanonicalIVIncrement:
  case VPInstruction::CanonicalIVIncrementNUW: {
    // Only part 0 emits the increment; the remaining parts reuse it.
    Value *Next = nullptr;
    if (Part == 0) {
      bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW;
      auto *Phi = State.get(getOperand(0), 0);
      // The loop step is equal to the vectorization factor (num of SIMD
      // elements) times the unroll factor (num of SIMD instructions).
      Value *Step =
          createStepForVF(Builder, Phi->getType(), State.VF, State.UF);
      Next = Builder.CreateAdd(Phi, Step, Name, IsNUW, false);
    } else {
      Next = State.get(this, 0);
    }

    State.set(this, Next, Part);
    break;
  }

  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::CanonicalIVIncrementForPartNUW: {
    bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementForPartNUW;
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    // Part 0 is the IV itself (offset zero); no add needed.
    if (Part == 0) {
      State.set(this, IV, Part);
      break;
    }

    // The canonical IV is incremented by the vectorization factor (num of SIMD
    // elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    Value *Next = Builder.CreateAdd(IV, Step, Name, IsNUW, false);
    State.set(this, Next, Part);
    break;
  }
  case VPInstruction::BranchOnCond: {
    // A branch is emitted once, not per part.
    if (Part != 0)
      break;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);

    if (getParent()->isExiting())
      CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);

    // Successor 0 was only a placeholder (CreateCondBr needs a non-null BB);
    // it is filled in later.
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    break;
  }
  case VPInstruction::BranchOnCount: {
    // A branch is emitted once, not per part.
    if (Part != 0)
      break;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part);
    Value *TC = State.get(getOperand(1), Part);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to the
    // forward destination (the exit/middle block) later when it is created.
    // Note that CreateCondBr expects a valid BB as first argument, so we need
    // to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    break;
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}
343 
/// Generate the IR for this VPInstruction: one generateInstruction call per
/// unroll part, with this instruction's fast-math flags installed on the
/// builder for the duration.
void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  // RAII guard restores the builder's previous FMF on scope exit.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(FMF);
  for (unsigned Part = 0; Part < State.UF; ++Part)
    generateInstruction(State, Part);
}
351 
352 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Debug helper: print this instruction to dbgs() using plan-local slot
/// numbering.
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}
357 
/// Print this VPInstruction as "EMIT [<result> =] <mnemonic> <operands>",
/// followed by fast-math flags and, if present, the debug location.
void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  // VPlan-specific opcodes get custom mnemonics; plain LLVM opcodes fall
  // through to the default case below.
  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::ICmpULE:
    O << "icmp ule";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::CanonicalIVIncrement:
    O << "VF * UF + ";
    break;
  case VPInstruction::CanonicalIVIncrementNUW:
    O << "VF * UF +(nuw) ";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part + ";
    break;
  case VPInstruction::CanonicalIVIncrementForPartNUW:
    O << "VF * Part +(nuw) ";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count ";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  O << FMF;

  for (const VPValue *Operand : operands()) {
    O << " ";
    Operand->printAsOperand(O, SlotTracker);
  }

  if (DL) {
    O << ", !dbg ";
    DL.print(O);
  }
}
420 #endif
421 
/// Set the fast-math flags applied when this instruction is codegen'd.
/// Only valid on floating-point opcodes (enforced by the assert).
void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) {
  // Make sure the VPInstruction is a floating-point operation.
  assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
          Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
          Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
          Opcode == Instruction::FCmp) &&
         "this op can't take fast-math flags");
  FMF = FMFNew;
}
431 
432 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
433 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
434                               VPSlotTracker &SlotTracker) const {
435   O << Indent << "WIDEN-CALL ";
436 
437   auto *CI = cast<CallInst>(getUnderlyingInstr());
438   if (CI->getType()->isVoidTy())
439     O << "void ";
440   else {
441     printAsOperand(O, SlotTracker);
442     O << " = ";
443   }
444 
445   O << "call @" << CI->getCalledFunction()->getName() << "(";
446   printOperands(O, SlotTracker);
447   O << ")";
448 }
449 
450 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
451                                 VPSlotTracker &SlotTracker) const {
452   O << Indent << "WIDEN-SELECT ";
453   printAsOperand(O, SlotTracker);
454   O << " = select ";
455   getOperand(0)->printAsOperand(O, SlotTracker);
456   O << ", ";
457   getOperand(1)->printAsOperand(O, SlotTracker);
458   O << ", ";
459   getOperand(2)->printAsOperand(O, SlotTracker);
460   O << (InvariantCond ? " (condition is loop invariant)" : "");
461 }
462 #endif
463 
/// Widen a select: emit one select per unroll part, sharing a single scalar
/// condition value when the condition is loop-invariant.
void VPWidenSelectRecipe::execute(VPTransformState &State) {
  auto &I = *cast<SelectInst>(getUnderlyingInstr());
  State.setDebugLocFromInst(&I);

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    // Propagate metadata from the original instruction onto the widened one.
    State.addMetadata(Sel, &I);
  }
}
484 
/// Widen a single-result instruction by emitting one vector instruction per
/// unroll part, dispatching on the underlying instruction's opcode.
void VPWidenRecipe::execute(VPTransformState &State) {
  auto &I = *cast<Instruction>(getUnderlyingValue());
  auto &Builder = State.Builder;
  switch (I.getOpcode()) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    State.setDebugLocFromInst(&I);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);

      // CreateNAryOp may constant-fold, so only copy flags when it produced
      // an actual instruction.
      if (auto *VecOp = dyn_cast<Instruction>(V)) {
        VecOp->copyIRFlags(&I);

        // If the instruction is vectorized and was in a basic block that needed
        // predication, we can't propagate poison-generating flags (nuw/nsw,
        // exact, etc.). The control flow has been linearized and the
        // instruction is no longer guarded by the predicate, which could make
        // the flag properties to no longer hold.
        if (State.MayGeneratePoisonRecipes.contains(this))
          VecOp->dropPoisonGeneratingFlags();
      }

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, &I);
    }

    break;
  }
  case Instruction::Freeze: {
    State.setDebugLocFromInst(&I);

    // Freeze is widened by freezing each part's vector operand.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = (I.getOpcode() == Instruction::FCmp);
    auto *Cmp = cast<CmpInst>(&I);
    State.setDebugLocFromInst(Cmp);
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        Builder.setFastMathFlags(Cmp->getFastMathFlags());
        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, &I);
    }

    break;
  }

  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto *CI = cast<CastInst>(&I);
    State.setDebugLocFromInst(CI);

    /// Vectorize casts.
    // For scalar VF the destination type stays scalar; otherwise widen the
    // scalar destination type to a vector of VF elements.
    Type *DestTy = (State.VF.isScalar())
                       ? CI->getType()
                       : VectorType::get(CI->getType(), State.VF);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
      State.set(this, Cast, Part);
      State.addMetadata(Cast, &I);
    }
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.
}
613 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
614 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
615                           VPSlotTracker &SlotTracker) const {
616   O << Indent << "WIDEN ";
617   printAsOperand(O, SlotTracker);
618   O << " = " << getUnderlyingInstr()->getOpcodeName() << " ";
619   printOperands(O, SlotTracker);
620 }
621 
/// Print a widened int/FP induction. The "\\l\"" / " +\n" fragments appear to
/// be escapes for multi-line DOT-style labels -- keep them byte-identical.
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INDUCTION";
  if (getTruncInst()) {
    // A truncated induction prints both the original IV and its trunc result.
    O << "\\l\"";
    O << " +\n" << Indent << "\"  " << VPlanIngredient(IV) << "\\l\"";
    O << " +\n" << Indent << "\"  ";
    getVPValue(0)->printAsOperand(O, SlotTracker);
  } else
    O << " " << VPlanIngredient(IV);

  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
636 #endif
637 
638 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
639   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
640   auto *StepC = dyn_cast<SCEVConstant>(getInductionDescriptor().getStep());
641   return StartC && StartC->isZero() && StepC && StepC->isOne();
642 }
643 
/// Operand 0 of a scalar-steps recipe is the canonical IV phi (the cast
/// enforces this invariant in assert builds).
VPCanonicalIVPHIRecipe *VPScalarIVStepsRecipe::getCanonicalIV() const {
  return cast<VPCanonicalIVPHIRecipe>(getOperand(0));
}
647 
648 bool VPScalarIVStepsRecipe::isCanonical() const {
649   auto *CanIV = getCanonicalIV();
650   // The start value of the steps-recipe must match the start value of the
651   // canonical induction and it must step by 1.
652   if (CanIV->getStartValue() != getStartValue())
653     return false;
654   auto *StepVPV = getStepValue();
655   if (StepVPV->getDef())
656     return false;
657   auto *StepC = dyn_cast_or_null<ConstantInt>(StepVPV->getLiveInIRValue());
658   return StepC && StepC->isOne();
659 }
660 
661 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
662 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
663                                   VPSlotTracker &SlotTracker) const {
664   O << Indent;
665   printAsOperand(O, SlotTracker);
666   O << Indent << "= SCALAR-STEPS ";
667   printOperands(O, SlotTracker);
668 }
669 #endif
670 
/// Widen a GEP: produce a vector-of-pointers GEP using vector operands only
/// for loop-varying values; fully invariant GEPs are cloned and broadcast.
void VPWidenGEPRecipe::execute(VPTransformState &State) {
  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    auto *Clone = State.Builder.Insert(GEP->clone());
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
      State.set(this, EntryPart, Part);
      State.addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = IsPtrLoopInvariant
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (IsIndexLoopInvariant[I - 1])
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // If the GEP instruction is vectorized and was in a basic block that
      // needed predication, we can't propagate the poison-generating 'inbounds'
      // flag. The control flow has been linearized and the GEP is no longer
      // guarded by the predicate, which could make the 'inbounds' properties to
      // no longer hold.
      bool IsInBounds =
          GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
                                             Indices, "", IsInBounds);
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.addMetadata(NewGEP, GEP);
    }
  }
}
743 
744 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
745 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
746                              VPSlotTracker &SlotTracker) const {
747   O << Indent << "WIDEN-GEP ";
748   O << (IsPtrLoopInvariant ? "Inv" : "Var");
749   size_t IndicesNumber = IsIndexLoopInvariant.size();
750   for (size_t I = 0; I < IndicesNumber; ++I)
751     O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]";
752 
753   O << " ";
754   printAsOperand(O, SlotTracker);
755   O << " = getelementptr ";
756   printOperands(O, SlotTracker);
757 }
758 
759 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
760                           VPSlotTracker &SlotTracker) const {
761   O << Indent << "BLEND ";
762   Phi->printAsOperand(O, false);
763   O << " =";
764   if (getNumIncomingValues() == 1) {
765     // Not a User of any mask: not really blending, this is a
766     // single-predecessor phi.
767     O << " ";
768     getIncomingValue(0)->printAsOperand(O, SlotTracker);
769   } else {
770     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
771       O << " ";
772       getIncomingValue(I)->printAsOperand(O, SlotTracker);
773       O << "/";
774       getMask(I)->printAsOperand(O, SlotTracker);
775     }
776   }
777 }
778 
/// Print as "REDUCE <result> = <chain> +[FMF] reduce.<op> (<vec>[, <cond>])".
void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  // FP reductions carry fast-math flags worth showing.
  if (isa<FPMathOperator>(getUnderlyingInstr()))
    O << getUnderlyingInstr()->getFastMathFlags();
  O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  // The condition operand is only present for predicated reductions.
  if (getCondOp()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
  if (RdxDesc->IntermediateStore)
    O << " (with final reduction value stored in invariant address sank "
         "outside of loop)";
}
799 
/// Print as "CLONE" (uniform) or "REPLICATE" followed by the cloned call or
/// opcode and its operands.
void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");

  if (!getUnderlyingInstr()->getType()->isVoidTy()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }
  if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
    O << "call @" << CB->getCalledFunction()->getName() << "(";
    // Print all operands except the last one — presumably the callee operand
    // of the call; confirm against how this recipe's operands are built.
    interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
                    O, [&O, &SlotTracker](VPValue *Op) {
                      Op->printAsOperand(O, SlotTracker);
                    });
    O << ")";
  } else {
    O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " ";
    printOperands(O, SlotTracker);
  }

  // AlsoPack: scalar results are additionally packed into a vector.
  if (AlsoPack)
    O << " (S->V)";
}
823 
/// Print as "PHI-PREDICATED-INSTRUCTION <result> = <operands>".
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}
831 
832 void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
833                                            VPSlotTracker &SlotTracker) const {
834   O << Indent << "WIDEN ";
835 
836   if (!isStore()) {
837     getVPSingleValue()->printAsOperand(O, SlotTracker);
838     O << " = ";
839   }
840   O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";
841 
842   printOperands(O, SlotTracker);
843 }
844 #endif
845 
846 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
847   Value *Start = getStartValue()->getLiveInIRValue();
848   PHINode *EntryPart = PHINode::Create(
849       Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt());
850 
851   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
852   EntryPart->addIncoming(Start, VectorPH);
853   EntryPart->setDebugLoc(DL);
854   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
855     State.set(this, EntryPart, Part);
856 }
857 
858 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
859 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
860                                    VPSlotTracker &SlotTracker) const {
861   O << Indent << "EMIT ";
862   printAsOperand(O, SlotTracker);
863   O << " = CANONICAL-INDUCTION";
864 }
865 #endif
866 
867 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) {
868   bool IsUniform = vputils::onlyFirstLaneUsed(this);
869   return all_of(users(),
870                 [&](const VPUser *U) { return U->usesScalars(this); }) &&
871          (IsUniform || !VF.isScalable());
872 }
873 
874 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
875 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
876                                           VPSlotTracker &SlotTracker) const {
877   O << Indent << "EMIT ";
878   printAsOperand(O, SlotTracker);
879   O << " = WIDEN-POINTER-INDUCTION ";
880   getStartValue()->printAsOperand(O, SlotTracker);
881   O << ", " << *IndDesc.getStep();
882 }
883 #endif
884 
885 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
886   assert(!State.Instance && "cannot be used in per-lane");
887   const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
888   SCEVExpander Exp(SE, DL, "induction");
889 
890   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
891                                  &*State.Builder.GetInsertPoint());
892 
893   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
894     State.set(this, Res, Part);
895 }
896 
897 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
898 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
899                                VPSlotTracker &SlotTracker) const {
900   O << Indent << "EMIT ";
901   getVPSingleValue()->printAsOperand(O, SlotTracker);
902   O << " = EXPAND SCEV " << *Expr;
903 }
904 #endif
905 
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
  // Widen the scalar canonical IV: for unroll part P, lane L receives
  // CanonicalIV + P * VF + L.
  Value *CanonicalIV = State.get(getOperand(0), 0);
  Type *STy = CanonicalIV->getType();
  // Emit the widening code just before the header's terminator.
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  ElementCount VF = State.VF;
  // Broadcast the scalar canonical IV to all lanes (no splat needed for
  // VF == 1).
  Value *VStart = VF.isScalar()
                      ? CanonicalIV
                      : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    // Scalar offset of this unroll part (Part * VF; handles scalable VFs).
    Value *VStep = createStepForVF(Builder, STy, VF, Part);
    if (VF.isVector()) {
      // Splat the part offset and add <0, 1, ..., VF-1> to obtain per-lane
      // offsets.
      VStep = Builder.CreateVectorSplat(VF, VStep);
      VStep =
          Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
    }
    Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
    State.set(this, CanonicalVectorIV, Part);
  }
}
925 
926 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
927 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
928                                      VPSlotTracker &SlotTracker) const {
929   O << Indent << "EMIT ";
930   printAsOperand(O, SlotTracker);
931   O << " = WIDEN-CANONICAL-INDUCTION ";
932   printOperands(O, SlotTracker);
933 }
934 #endif
935 
void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  // For VF == 1 the recurrence stays scalar.
  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  if (State.VF.isVector()) {
    // Insert the scalar start value into the LAST lane of a poison vector,
    // emitting the code in the preheader. The last-lane index is computed
    // from the runtime VF so this also works for scalable vectors.
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence. Only the preheader incoming is
  // added here; the backedge value is not wired up by this recipe.
  PHINode *EntryPart = PHINode::Create(
      VecTy, 2, "vector.recur", &*State.CFG.PrevBB->getFirstInsertionPt());
  EntryPart->addIncoming(VectorInit, VectorPH);
  // Only part 0 is set for first-order recurrence phis.
  State.set(this, EntryPart, 0);
}
963 
964 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
965 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
966                                             VPSlotTracker &SlotTracker) const {
967   O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
968   printAsOperand(O, SlotTracker);
969   O << " = phi ";
970   printOperands(O, SlotTracker);
971 }
972 #endif
973 
void VPReductionPHIRecipe::execute(VPTransformState &State) {
  PHINode *PN = cast<PHINode>(getUnderlyingValue());
  auto &Builder = State.Builder;

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  // In-loop reductions and VF == 1 keep the reduction value scalar.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy =
      ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  // Ordered (strict) reductions use a single phi regardless of the unroll
  // factor.
  unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart =
        PHINode::Create(VecTy, 2, "vec.phi", &*HeaderBB->getFirstInsertionPt());
    State.set(this, EntryPart, Part);
  }

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) {
    // MinMax reductions have the start value as their identity.
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      // Splat the start value in the preheader; the splat serves both as the
      // part-0 incoming value and as the identity for the remaining parts.
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden =
          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
    }
  } else {
    // Other reduction kinds use their neutral element as identity.
    Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                         RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      // For vector phis, place the start value in lane 0 of an otherwise
      // identity-filled vector; the insertelement is emitted in the
      // preheader.
      Iden = Builder.CreateVectorSplat(State.VF, Iden);
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      Constant *Zero = Builder.getInt32(0);
      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
    }
  }

  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart = State.get(this, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part; the other parts start from the identity.
    Value *StartVal = (Part == 0) ? StartV : Iden;
    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
  }
}
1037 
1038 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1039 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1040                                  VPSlotTracker &SlotTracker) const {
1041   O << Indent << "WIDEN-REDUCTION-PHI ";
1042 
1043   printAsOperand(O, SlotTracker);
1044   O << " = phi ";
1045   printOperands(O, SlotTracker);
1046 }
1047 #endif
1048 
void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

  // Currently we enter here in the VPlan-native path for non-induction
  // PHIs where all control flow is uniform. We simply widen these PHIs.
  // Create a vector phi with no operands - the vector phi operands will be
  // set at the end of vector code generation.
  VPBasicBlock *Parent = getParent();
  VPRegionBlock *LoopRegion = Parent->getEnclosingLoopRegion();
  unsigned StartIdx = 0;
  // For phis in header blocks of loop regions, use the index of the value
  // coming from the preheader.
  if (LoopRegion->getEntryBasicBlock() == Parent) {
    // No early break: if several operands match (not expected), the last
    // match wins.
    for (unsigned I = 0; I < getNumOperands(); ++I) {
      if (getIncomingBlock(I) ==
          LoopRegion->getSinglePredecessor()->getExitingBasicBlock())
        StartIdx = I;
    }
  }
  // The phi's type is taken from the chosen incoming value's part-0
  // widened value.
  Value *Op0 = State.get(getOperand(StartIdx), 0);
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  // NOTE(review): only part 0 is set — presumably the native path runs with
  // UF == 1; confirm before relying on other parts.
  State.set(this, VecPhi, 0);
}
1074 
1075 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1076 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1077                              VPSlotTracker &SlotTracker) const {
1078   O << Indent << "WIDEN-PHI ";
1079 
1080   auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
1081   // Unless all incoming values are modeled in VPlan  print the original PHI
1082   // directly.
1083   // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
1084   // values as VPValues.
1085   if (getNumOperands() != OriginalPhi->getNumOperands()) {
1086     O << VPlanIngredient(OriginalPhi);
1087     return;
1088   }
1089 
1090   printAsOperand(O, SlotTracker);
1091   O << " = phi ";
1092   printOperands(O, SlotTracker);
1093 }
1094 #endif
1095 
1096 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
1097 // remove VPActiveLaneMaskPHIRecipe.
1098 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
1099   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1100   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
1101     Value *StartMask = State.get(getOperand(0), Part);
1102     PHINode *EntryPart =
1103         State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
1104     EntryPart->addIncoming(StartMask, VectorPH);
1105     EntryPart->setDebugLoc(DL);
1106     State.set(this, EntryPart, Part);
1107   }
1108 }
1109 
1110 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1111 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1112                                       VPSlotTracker &SlotTracker) const {
1113   O << Indent << "ACTIVE-LANE-MASK-PHI ";
1114 
1115   printAsOperand(O, SlotTracker);
1116   O << " = phi ";
1117   printOperands(O, SlotTracker);
1118 }
1119 #endif
1120