//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

extern cl::opt<bool> EnableVPlanNativePath;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

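// Conservative queries for the memory effects and side effects of a recipe.
// Where an underlying IR instruction is available, its effects are used;
// unknown recipe kinds conservatively report true.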
bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPWidenMemoryInstructionSC: {
    return cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
  }
  case VPReplicateSC:
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPBranchOnMaskSC:
    return false;
  case VPWidenIntOrFpInductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenPHISC:
  case VPBlendSC:
  case VPWidenSC:
  case VPWidenGEPSC:
  case VPReductionSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenMemoryInstructionSC: {
    return !cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
  }
  case VPReplicateSC:
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPBranchOnMaskSC:
    return false;
  case VPWidenIntOrFpInductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenPHISC:
  case VPBlendSC:
  case VPWidenSC:
  case VPWidenGEPSC:
  case VPReductionSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPWidenIntOrFpInductionSC:
  case VPWidenPointerInductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenPHISC:
  case VPBlendSC:
  case VPWidenSC:
  case VPWidenGEPSC:
  case VPReductionSC:
  case VPWidenSelectSC:
  case VPScalarIVStepsSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  auto Lane = VPLane::getLastLaneForVF(State.VF);
  VPValue *ExitValue = getOperand(0);
  if (Plan.isUniformAfterVectorization(ExitValue))
    Lane = VPLane::getFirstLane();
  Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
                   State.Builder.GetInsertBlock());
}

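// Helpers for inserting, removing and moving recipes within VPBasicBlocks,
// keeping the recipe's Parent pointer in sync with the block's recipe list.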
void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  Parent = InsertPos->getParent();
  Parent->getRecipeList().insert(InsertPos->getIterator(), this);
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  Parent = &BB;
  BB.getRecipeList().insert(I, this);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  Parent = InsertPos->getParent();
  Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this);
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

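// Generate the IR for this VPInstruction for a single unroll part, reading
// operand values from State and recording any result back in State.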
void VPInstruction::generateInstruction(VPTransformState &State,
                                        unsigned Part) {
  IRBuilderBase &Builder = State.Builder;
  Builder.SetCurrentDebugLocation(DL);

  if (Instruction::isBinaryOp(getOpcode())) {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    Value *V =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    State.set(this, V, Part);
    return;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    Value *V = Builder.CreateNot(A, Name);
    State.set(this, V, Part);
    break;
  }
  case VPInstruction::ICmpULE: {
    Value *IV = State.get(getOperand(0), Part);
    Value *TC = State.get(getOperand(1), Part);
    Value *V = Builder.CreateICmpULE(IV, TC, Name);
    State.set(this, V, Part);
    break;
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    Value *V = Builder.CreateSelect(Cond, Op1, Op2, Name);
    State.set(this, V, Part);
    break;
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), Part);

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    Instruction *Call = Builder.CreateIntrinsic(
        Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()},
        {VIVElem0, ScalarTC}, nullptr, Name);
    State.set(this, Call, Part);
    break;
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy()) {
      State.set(this, PartMinus1, Part);
    } else {
      Value *V2 = State.get(getOperand(1), Part);
      State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1, Name),
                Part);
    }
    break;
  }
  case VPInstruction::CanonicalIVIncrement:
  case VPInstruction::CanonicalIVIncrementNUW: {
    Value *Next = nullptr;
    if (Part == 0) {
      bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW;
      auto *Phi = State.get(getOperand(0), 0);
      // The loop step is equal to the vectorization factor (num of SIMD
      // elements) times the unroll factor (num of SIMD instructions).
      Value *Step =
          createStepForVF(Builder, Phi->getType(), State.VF, State.UF);
      Next = Builder.CreateAdd(Phi, Step, Name, IsNUW, false);
    } else {
      Next = State.get(this, 0);
    }

    State.set(this, Next, Part);
    break;
  }

  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::CanonicalIVIncrementForPartNUW: {
    bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementForPartNUW;
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    if (Part == 0) {
      State.set(this, IV, Part);
      break;
    }

    // The canonical IV is incremented by the vectorization factor (num of SIMD
    // elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    Value *Next = Builder.CreateAdd(IV, Step, Name, IsNUW, false);
    State.set(this, Next, Part);
    break;
  }
  case VPInstruction::BranchOnCond: {
    if (Part != 0)
      break;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);

    if (getParent()->isExiting())
      CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);

    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    break;
  }
  case VPInstruction::BranchOnCount: {
    if (Part != 0)
      break;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part);
    Value *TC = State.get(getOperand(1), Part);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to the
    // forward destination (the exit/middle block) later when it is created.
    // Note that CreateCondBr expects a valid BB as first argument, so we need
    // to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    break;
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(FMF);
  for (unsigned Part = 0; Part < State.UF; ++Part)
    generateInstruction(State, Part);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::ICmpULE:
    O << "icmp ule";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::CanonicalIVIncrement:
    O << "VF * UF + ";
    break;
  case VPInstruction::CanonicalIVIncrementNUW:
    O << "VF * UF +(nuw) ";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part + ";
    break;
  case VPInstruction::CanonicalIVIncrementForPartNUW:
    O << "VF * Part +(nuw) ";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count ";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  O << FMF;

  for (const VPValue *Operand : operands()) {
    O << " ";
    Operand->printAsOperand(O, SlotTracker);
  }

  if (DL) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) {
  // Make sure the VPInstruction is a floating-point operation.
  assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
          Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
          Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
          Opcode == Instruction::FCmp) &&
         "this op can't take fast-math flags");
  FMF = FMFNew;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  auto *CI = cast<CallInst>(getUnderlyingInstr());
  if (CI->getType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CI->getCalledFunction()->getName() << "(";
  printOperands(O, SlotTracker);
  O << ")";
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (InvariantCond ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  auto &I = *cast<SelectInst>(getUnderlyingInstr());
  State.setDebugLocFromInst(&I);

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    State.addMetadata(Sel, &I);
  }
}

void VPWidenRecipe::execute(VPTransformState &State) {
  auto &I = *cast<Instruction>(getUnderlyingValue());
  auto &Builder = State.Builder;
  switch (I.getOpcode()) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    State.setDebugLocFromInst(&I);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V)) {
        VecOp->copyIRFlags(&I);

        // If the instruction is vectorized and was in a basic block that needed
        // predication, we can't propagate poison-generating flags (nuw/nsw,
        // exact, etc.). The control flow has been linearized and the
        // instruction is no longer guarded by the predicate, which could make
        // the flag properties no longer hold.
        if (State.MayGeneratePoisonRecipes.contains(this))
          VecOp->dropPoisonGeneratingFlags();
      }

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, &I);
    }

    break;
  }
  case Instruction::Freeze: {
    State.setDebugLocFromInst(&I);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = (I.getOpcode() == Instruction::FCmp);
    auto *Cmp = cast<CmpInst>(&I);
    State.setDebugLocFromInst(Cmp);
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        Builder.setFastMathFlags(Cmp->getFastMathFlags());
        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, &I);
    }

    break;
  }

  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto *CI = cast<CastInst>(&I);
    State.setDebugLocFromInst(CI);

    /// Vectorize casts.
    Type *DestTy = (State.VF.isScalar())
                       ? CI->getType()
                       : VectorType::get(CI->getType(), State.VF);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
      State.set(this, Cast, Part);
      State.addMetadata(Cast, &I);
    }
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << getUnderlyingInstr()->getOpcodeName() << " ";
  printOperands(O, SlotTracker);
}

void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INDUCTION";
  if (getTruncInst()) {
    O << "\\l\"";
    O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
    O << " +\n" << Indent << "\" ";
    getVPValue(0)->printAsOperand(O, SlotTracker);
  } else
    O << " " << VPlanIngredient(IV);

  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

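// An integer or FP induction is canonical if it starts at a constant 0 and
// advances by a constant step of 1.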
bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
  auto *StepC = dyn_cast<SCEVConstant>(getInductionDescriptor().getStep());
  return StartC && StartC->isZero() && StepC && StepC->isOne();
}

VPCanonicalIVPHIRecipe *VPScalarIVStepsRecipe::getCanonicalIV() const {
  return cast<VPCanonicalIVPHIRecipe>(getOperand(0));
}

bool VPScalarIVStepsRecipe::isCanonical() const {
  auto *CanIV = getCanonicalIV();
  // The start value of the steps-recipe must match the start value of the
  // canonical induction and it must step by 1.
  if (CanIV->getStartValue() != getStartValue())
    return false;
  auto *StepVPV = getStepValue();
  if (StepVPV->getDef())
    return false;
  auto *StepC = dyn_cast_or_null<ConstantInt>(StepVPV->getLiveInIRValue());
  return StepC && StepC->isOne();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << Indent << "= SCALAR-STEPS ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenGEPRecipe::execute(VPTransformState &State) {
  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    // loop-invariant operands, this special case will no longer be
    // required. We would add the scalarization decision to
    // collectLoopScalars() and teach getVectorValue() to broadcast
    // the lane-zero scalar value.
    auto *Clone = State.Builder.Insert(GEP->clone());
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
      State.set(this, EntryPart, Part);
      State.addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = IsPtrLoopInvariant
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (IsIndexLoopInvariant[I - 1])
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // If the GEP instruction is vectorized and was in a basic block that
      // needed predication, we can't propagate the poison-generating 'inbounds'
      // flag. The control flow has been linearized and the GEP is no longer
      // guarded by the predicate, which could make the 'inbounds' properties
      // no longer hold.
      bool IsInBounds =
          GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
                                             Indices, "", IsInBounds);
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.addMetadata(NewGEP, GEP);
    }
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-GEP ";
  O << (IsPtrLoopInvariant ? "Inv" : "Var");
  size_t IndicesNumber = IsIndexLoopInvariant.size();
  for (size_t I = 0; I < IndicesNumber; ++I)
    O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]";

  O << " ";
  printAsOperand(O, SlotTracker);
  O << " = getelementptr ";
  printOperands(O, SlotTracker);
}
#endif

void VPBlendRecipe::execute(VPTransformState &State) {
  State.setDebugLocFromInst(Phi);
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi and
  // are essentially undef are taken from In0.
  VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 = State.get(getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(getMask(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.set(this, Entry[Part], Part);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "BLEND ";
  Phi->printAsOperand(O, false);
  O << " =";
  if (getNumIncomingValues() == 1) {
    // Not a User of any mask: not really blending, this is a
    // single-predecessor phi.
    O << " ";
    getIncomingValue(0)->printAsOperand(O, SlotTracker);
  } else {
    for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
      O << " ";
      getIncomingValue(I)->printAsOperand(O, SlotTracker);
      O << "/";
      getMask(I)->printAsOperand(O, SlotTracker);
    }
  }
}

void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  if (isa<FPMathOperator>(getUnderlyingInstr()))
    O << getUnderlyingInstr()->getFastMathFlags();
  O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  if (getCondOp()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
  if (RdxDesc->IntermediateStore)
    O << " (with final reduction value stored in invariant address sank "
         "outside of loop)";
}

void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");

  if (!getUnderlyingInstr()->getType()->isVoidTy()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }
  if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
    O << "call @" << CB->getCalledFunction()->getName() << "(";
    interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
                    O, [&O, &SlotTracker](VPValue *Op) {
                      Op->printAsOperand(O, SlotTracker);
                    });
    O << ")";
  } else {
    O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " ";
    printOperands(O, SlotTracker);
  }

  if (AlsoPack)
    O << " (S->V)";
}
#endif

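// Emit the conditional branch for a predicated replicated block, using the
// mask bit of the current part and lane (or an all-true condition when there
// is no mask).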
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}

void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";

  if (!isStore()) {
    getVPSingleValue()->printAsOperand(O, SlotTracker);
    O << " = ";
  }
  O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";

  printOperands(O, SlotTracker);
}
#endif

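// Create the scalar phi for the canonical induction in the vector loop header.
// A single phi is created and reused for all unroll parts.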
void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
  Value *Start = getStartValue()->getLiveInIRValue();
  PHINode *EntryPart = PHINode::Create(
      Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt());

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(DL);
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, EntryPart, Part);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION";
}
#endif

bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) {
  return IsScalarAfterVectorization &&
         (!VF.isScalable() || vputils::onlyFirstLaneUsed(this));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-POINTER-INDUCTION ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << ", " << *IndDesc.getStep();
}
#endif

void VPExpandSCEVRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "cannot be used in per-lane");
  const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
  SCEVExpander Exp(SE, DL, "induction");

  Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
                                 &*State.Builder.GetInsertPoint());

  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, Res, Part);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  getVPSingleValue()->printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
#endif

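// Widen the canonical induction by splatting the scalar IV and adding a step
// vector, so part P holds lanes <iv + P * VF + 0, ..., iv + P * VF + VF - 1>.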
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
  Value *CanonicalIV = State.get(getOperand(0), 0);
  Type *STy = CanonicalIV->getType();
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  ElementCount VF = State.VF;
  Value *VStart = VF.isScalar()
                      ? CanonicalIV
                      : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *VStep = createStepForVF(Builder, STy, VF, Part);
    if (VF.isVector()) {
      VStep = Builder.CreateVectorSplat(VF, VStep);
      VStep =
          Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
    }
    Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
    State.set(this, CanonicalVectorIV, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *EntryPart = PHINode::Create(
      VecTy, 2, "vector.recur", &*State.CFG.PrevBB->getFirstInsertionPt());
  EntryPart->addIncoming(VectorInit, VectorPH);
  State.set(this, EntryPart, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif


void VPReductionPHIRecipe::execute(VPTransformState &State) {
  PHINode *PN = cast<PHINode>(getUnderlyingValue());
  auto &Builder = State.Builder;

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy =
      ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart =
        PHINode::Create(VecTy, 2, "vec.phi", &*HeaderBB->getFirstInsertionPt());
    State.set(this, EntryPart, Part);
  }

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) {
    // MinMax reductions have the start value as their identity.
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden =
          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
    }
  } else {
    Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                         RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      Iden = Builder.CreateVectorSplat(State.VF, Iden);
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      Constant *Zero = Builder.getInt32(0);
      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
    }
  }

  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart = State.get(this, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? StartV : Iden;
    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

  // Currently we enter here in the VPlan-native path for non-induction
  // PHIs where all control flow is uniform. We simply widen these PHIs.
  // Create a vector phi with no operands - the vector phi operands will be
  // set at the end of vector code generation.
  VPBasicBlock *Parent = getParent();
  VPRegionBlock *LoopRegion = Parent->getEnclosingLoopRegion();
  unsigned StartIdx = 0;
  // For phis in header blocks of loop regions, use the index of the value
  // coming from the preheader.
  if (LoopRegion->getEntryBasicBlock() == Parent) {
    for (unsigned I = 0; I < getNumOperands(); ++I) {
      if (getIncomingBlock(I) ==
          LoopRegion->getSinglePredecessor()->getExitingBasicBlock())
        StartIdx = I;
    }
  }
  Value *Op0 = State.get(getOperand(StartIdx), 0);
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  State.set(this, VecPhi, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
  // values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *StartMask = State.get(getOperand(0), Part);
    PHINode *EntryPart =
        State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
    EntryPart->addIncoming(StartMask, VectorPH);
    EntryPart->setDebugLoc(DL);
    State.set(this, EntryPart, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif