1 //===- MVETailPredication.cpp - MVE Tail Predication ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Armv8.1m introduced MVE, M-Profile Vector Extension, and low-overhead
11 /// branches to help accelerate DSP applications. These two extensions can be
12 /// combined to provide implicit vector predication within a low-overhead loop.
13 /// The HardwareLoops pass inserts intrinsics identifying loops that the
14 /// backend will attempt to convert into a low-overhead loop. The vectorizer is
15 /// responsible for generating a vectorized loop in which the lanes are
16 /// predicated upon the iteration counter. This pass looks at these predicated
17 /// vector loops, that are targets for low-overhead loops, and prepares it for
18 /// code generation. Once the vectorizer has produced a masked loop, there's a
19 /// couple of final forms:
20 /// - A tail-predicated loop, with implicit predication.
/// - A loop containing multiple VCTP instructions, predicating multiple VPT
///   blocks of instructions operating on different vector types.
23 ///
/// This pass inserts the VCTP intrinsic to represent the effect of tail
/// predication. This will be picked up by the ARM Low-overhead loop pass,
/// which performs the final transformation to a DLSTP or WLSTP tail-predicated
/// loop.
28 
29 #include "ARM.h"
30 #include "ARMSubtarget.h"
31 #include "llvm/Analysis/LoopInfo.h"
32 #include "llvm/Analysis/LoopPass.h"
33 #include "llvm/Analysis/ScalarEvolution.h"
34 #include "llvm/Analysis/ScalarEvolutionExpander.h"
35 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
36 #include "llvm/Analysis/TargetTransformInfo.h"
37 #include "llvm/CodeGen/TargetPassConfig.h"
38 #include "llvm/InitializePasses.h"
39 #include "llvm/IR/IRBuilder.h"
40 #include "llvm/IR/Instructions.h"
41 #include "llvm/IR/IntrinsicsARM.h"
42 #include "llvm/IR/PatternMatch.h"
43 #include "llvm/Support/Debug.h"
44 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
45 #include "llvm/Transforms/Utils/LoopUtils.h"
46 
47 using namespace llvm;
48 
49 #define DEBUG_TYPE "mve-tail-predication"
50 #define DESC "Transform predicated vector loops to use MVE tail predication"
51 
// Hidden command-line switch for this pass. It is initialised to true, so the
// transformation is off unless -disable-mve-tail-predication=false is given;
// runOnLoop returns early while the flag is set.
cl::opt<bool>
DisableTailPredication("disable-mve-tail-predication", cl::Hidden,
                       cl::init(true),
                       cl::desc("Disable MVE Tail Predication"));
56 namespace {
57 
58 class MVETailPredication : public LoopPass {
59   SmallVector<IntrinsicInst*, 4> MaskedInsts;
60   Loop *L = nullptr;
61   LoopInfo *LI = nullptr;
62   const DataLayout *DL;
63   DominatorTree *DT = nullptr;
64   ScalarEvolution *SE = nullptr;
65   TargetTransformInfo *TTI = nullptr;
66   TargetLibraryInfo *TLI = nullptr;
67   bool ClonedVCTPInExitBlock = false;
68 
69 public:
70   static char ID;
71 
72   MVETailPredication() : LoopPass(ID) { }
73 
74   void getAnalysisUsage(AnalysisUsage &AU) const override {
75     AU.addRequired<ScalarEvolutionWrapperPass>();
76     AU.addRequired<LoopInfoWrapperPass>();
77     AU.addRequired<TargetPassConfig>();
78     AU.addRequired<TargetTransformInfoWrapperPass>();
79     AU.addRequired<DominatorTreeWrapperPass>();
80     AU.addRequired<TargetLibraryInfoWrapperPass>();
81     AU.addPreserved<LoopInfoWrapperPass>();
82     AU.setPreservesCFG();
83   }
84 
85   bool runOnLoop(Loop *L, LPPassManager&) override;
86 
87 private:
88 
89   /// Perform the relevant checks on the loop and convert if possible.
90   bool TryConvert(Value *TripCount);
91 
92   /// Return whether this is a vectorized loop, that contains masked
93   /// load/stores.
94   bool IsPredicatedVectorLoop();
95 
96   /// Compute a value for the total number of elements that the predicated
97   /// loop will process.
98   Value *ComputeElements(Value *TripCount, VectorType *VecTy);
99 
100   /// Is the icmp that generates an i1 vector, based upon a loop counter
101   /// and a limit that is defined outside the loop.
102   bool isTailPredicate(Instruction *Predicate, Value *NumElements);
103 
104   /// Insert the intrinsic to represent the effect of tail predication.
105   void InsertVCTPIntrinsic(Instruction *Predicate,
106                            DenseMap<Instruction*, Instruction*> &NewPredicates,
107                            VectorType *VecTy,
108                            Value *NumElements);
109 
110   /// Rematerialize the iteration count in exit blocks, which enables
111   /// ARMLowOverheadLoops to better optimise away loop update statements inside
112   /// hardware-loops.
113   void RematerializeIterCount();
114 };
115 
116 } // end namespace
117 
118 static bool IsDecrement(Instruction &I) {
119   auto *Call = dyn_cast<IntrinsicInst>(&I);
120   if (!Call)
121     return false;
122 
123   Intrinsic::ID ID = Call->getIntrinsicID();
124   return ID == Intrinsic::loop_decrement_reg;
125 }
126 
127 static bool IsMasked(Instruction *I) {
128   auto *Call = dyn_cast<IntrinsicInst>(I);
129   if (!Call)
130     return false;
131 
132   Intrinsic::ID ID = Call->getIntrinsicID();
133   // TODO: Support gather/scatter expand/compress operations.
134   return ID == Intrinsic::masked_store || ID == Intrinsic::masked_load;
135 }
136 
137 void MVETailPredication::RematerializeIterCount() {
138   SmallVector<WeakTrackingVH, 16> DeadInsts;
139   SCEVExpander Rewriter(*SE, *DL, "mvetp");
140   ReplaceExitVal ReplaceExitValue = AlwaysRepl;
141 
142   formLCSSARecursively(*L, *DT, LI, SE);
143   rewriteLoopExitValues(L, LI, TLI, SE, Rewriter, DT, ReplaceExitValue,
144                         DeadInsts);
145 }
146 
147 bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
148   if (skipLoop(L) || DisableTailPredication)
149     return false;
150 
151   Function &F = *L->getHeader()->getParent();
152   auto &TPC = getAnalysis<TargetPassConfig>();
153   auto &TM = TPC.getTM<TargetMachine>();
154   auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
155   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
156   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
157   TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
158   SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
159   auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
160   TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr;
161   DL = &L->getHeader()->getModule()->getDataLayout();
162   this->L = L;
163 
164   // The MVE and LOB extensions are combined to enable tail-predication, but
165   // there's nothing preventing us from generating VCTP instructions for v8.1m.
166   if (!ST->hasMVEIntegerOps() || !ST->hasV8_1MMainlineOps()) {
167     LLVM_DEBUG(dbgs() << "ARM TP: Not a v8.1m.main+mve target.\n");
168     return false;
169   }
170 
171   BasicBlock *Preheader = L->getLoopPreheader();
172   if (!Preheader)
173     return false;
174 
175   auto FindLoopIterations = [](BasicBlock *BB) -> IntrinsicInst* {
176     for (auto &I : *BB) {
177       auto *Call = dyn_cast<IntrinsicInst>(&I);
178       if (!Call)
179         continue;
180 
181       Intrinsic::ID ID = Call->getIntrinsicID();
182       if (ID == Intrinsic::set_loop_iterations ||
183           ID == Intrinsic::test_set_loop_iterations)
184         return cast<IntrinsicInst>(&I);
185     }
186     return nullptr;
187   };
188 
189   // Look for the hardware loop intrinsic that sets the iteration count.
190   IntrinsicInst *Setup = FindLoopIterations(Preheader);
191 
192   // The test.set iteration could live in the pre-preheader.
193   if (!Setup) {
194     if (!Preheader->getSinglePredecessor())
195       return false;
196     Setup = FindLoopIterations(Preheader->getSinglePredecessor());
197     if (!Setup)
198       return false;
199   }
200 
201   // Search for the hardware loop intrinic that decrements the loop counter.
202   IntrinsicInst *Decrement = nullptr;
203   for (auto *BB : L->getBlocks()) {
204     for (auto &I : *BB) {
205       if (IsDecrement(I)) {
206         Decrement = cast<IntrinsicInst>(&I);
207         break;
208       }
209     }
210   }
211 
212   if (!Decrement)
213     return false;
214 
215   LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"
216              << *Decrement << "\n");
217 
218   if (TryConvert(Setup->getArgOperand(0))) {
219     if (ClonedVCTPInExitBlock)
220       RematerializeIterCount();
221     return true;
222   }
223 
224   return false;
225 }
226 
227 bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) {
228   // Look for the following:
229 
230   // %trip.count.minus.1 = add i32 %N, -1
231   // %broadcast.splatinsert10 = insertelement <4 x i32> undef,
232   //                                          i32 %trip.count.minus.1, i32 0
233   // %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10,
234   //                                    <4 x i32> undef,
235   //                                    <4 x i32> zeroinitializer
236   // ...
237   // ...
238   // %index = phi i32
239   // %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
240   // %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
241   //                                  <4 x i32> undef,
242   //                                  <4 x i32> zeroinitializer
243   // %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
244   // %pred = icmp ule <4 x i32> %induction, %broadcast.splat11
245 
246   // And return whether V == %pred.
247 
248   using namespace PatternMatch;
249 
250   CmpInst::Predicate Pred;
251   Instruction *Shuffle = nullptr;
252   Instruction *Induction = nullptr;
253 
254   // The vector icmp
255   if (!match(I, m_ICmp(Pred, m_Instruction(Induction),
256                        m_Instruction(Shuffle))) ||
257       Pred != ICmpInst::ICMP_ULE)
258     return false;
259 
260   // First find the stuff outside the loop which is setting up the limit
261   // vector....
262   // The invariant shuffle that broadcast the limit into a vector.
263   Instruction *Insert = nullptr;
264   if (!match(Shuffle, m_ShuffleVector(m_Instruction(Insert), m_Undef(),
265                                       m_Zero())))
266     return false;
267 
268   // Insert the limit into a vector.
269   Instruction *BECount = nullptr;
270   if (!match(Insert, m_InsertElement(m_Undef(), m_Instruction(BECount),
271                                      m_Zero())))
272     return false;
273 
274   // The limit calculation, backedge count.
275   Value *TripCount = nullptr;
276   if (!match(BECount, m_Add(m_Value(TripCount), m_AllOnes())))
277     return false;
278 
279   if (TripCount != NumElements || !L->isLoopInvariant(BECount))
280     return false;
281 
282   // Now back to searching inside the loop body...
283   // Find the add with takes the index iv and adds a constant vector to it.
284   Instruction *BroadcastSplat = nullptr;
285   Constant *Const = nullptr;
286   if (!match(Induction, m_Add(m_Instruction(BroadcastSplat),
287                               m_Constant(Const))))
288    return false;
289 
290   // Check that we're adding <0, 1, 2, 3...
291   if (auto *CDS = dyn_cast<ConstantDataSequential>(Const)) {
292     for (unsigned i = 0; i < CDS->getNumElements(); ++i) {
293       if (CDS->getElementAsInteger(i) != i)
294         return false;
295     }
296   } else
297     return false;
298 
299   // The shuffle which broadcasts the index iv into a vector.
300   if (!match(BroadcastSplat, m_ShuffleVector(m_Instruction(Insert), m_Undef(),
301                                              m_Zero())))
302     return false;
303 
304   // The insert element which initialises a vector with the index iv.
305   Instruction *IV = nullptr;
306   if (!match(Insert, m_InsertElement(m_Undef(), m_Instruction(IV), m_Zero())))
307     return false;
308 
309   // The index iv.
310   auto *Phi = dyn_cast<PHINode>(IV);
311   if (!Phi)
312     return false;
313 
314   // TODO: Don't think we need to check the entry value.
315   Value *OnEntry = Phi->getIncomingValueForBlock(L->getLoopPreheader());
316   if (!match(OnEntry, m_Zero()))
317     return false;
318 
319   Value *InLoop = Phi->getIncomingValueForBlock(L->getLoopLatch());
320   unsigned Lanes = cast<VectorType>(Insert->getType())->getNumElements();
321 
322   Instruction *LHS = nullptr;
323   if (!match(InLoop, m_Add(m_Instruction(LHS), m_SpecificInt(Lanes))))
324     return false;
325 
326   return LHS == Phi;
327 }
328 
329 static VectorType* getVectorType(IntrinsicInst *I) {
330   unsigned TypeOp = I->getIntrinsicID() == Intrinsic::masked_load ? 0 : 1;
331   auto *PtrTy = cast<PointerType>(I->getOperand(TypeOp)->getType());
332   return cast<VectorType>(PtrTy->getElementType());
333 }
334 
335 bool MVETailPredication::IsPredicatedVectorLoop() {
336   // Check that the loop contains at least one masked load/store intrinsic.
337   // We only support 'normal' vector instructions - other than masked
338   // load/stores.
339   for (auto *BB : L->getBlocks()) {
340     for (auto &I : *BB) {
341       if (IsMasked(&I)) {
342         VectorType *VecTy = getVectorType(cast<IntrinsicInst>(&I));
343         unsigned Lanes = VecTy->getNumElements();
344         unsigned ElementWidth = VecTy->getScalarSizeInBits();
345         // MVE vectors are 128-bit, but don't support 128 x i1.
346         // TODO: Can we support vectors larger than 128-bits?
347         unsigned MaxWidth = TTI->getRegisterBitWidth(true);
348         if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth)
349           return false;
350         MaskedInsts.push_back(cast<IntrinsicInst>(&I));
351       } else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) {
352         for (auto &U : Int->args()) {
353           if (isa<VectorType>(U->getType()))
354             return false;
355         }
356       }
357     }
358   }
359 
360   return !MaskedInsts.empty();
361 }
362 
363 Value* MVETailPredication::ComputeElements(Value *TripCount,
364                                            VectorType *VecTy) {
365   const SCEV *TripCountSE = SE->getSCEV(TripCount);
366   ConstantInt *VF = ConstantInt::get(cast<IntegerType>(TripCount->getType()),
367                                      VecTy->getNumElements());
368 
369   if (VF->equalsInt(1))
370     return nullptr;
371 
372   // TODO: Support constant trip counts.
373   auto VisitAdd = [&](const SCEVAddExpr *S) -> const SCEVMulExpr* {
374     if (auto *Const = dyn_cast<SCEVConstant>(S->getOperand(0))) {
375       if (Const->getAPInt() != -VF->getValue())
376         return nullptr;
377     } else
378       return nullptr;
379     return dyn_cast<SCEVMulExpr>(S->getOperand(1));
380   };
381 
382   auto VisitMul = [&](const SCEVMulExpr *S) -> const SCEVUDivExpr* {
383     if (auto *Const = dyn_cast<SCEVConstant>(S->getOperand(0))) {
384       if (Const->getValue() != VF)
385         return nullptr;
386     } else
387       return nullptr;
388     return dyn_cast<SCEVUDivExpr>(S->getOperand(1));
389   };
390 
391   auto VisitDiv = [&](const SCEVUDivExpr *S) -> const SCEV* {
392     if (auto *Const = dyn_cast<SCEVConstant>(S->getRHS())) {
393       if (Const->getValue() != VF)
394         return nullptr;
395     } else
396       return nullptr;
397 
398     if (auto *RoundUp = dyn_cast<SCEVAddExpr>(S->getLHS())) {
399       if (auto *Const = dyn_cast<SCEVConstant>(RoundUp->getOperand(0))) {
400         if (Const->getAPInt() != (VF->getValue() - 1))
401           return nullptr;
402       } else
403         return nullptr;
404 
405       return RoundUp->getOperand(1);
406     }
407     return nullptr;
408   };
409 
410   // TODO: Can we use SCEV helpers, such as findArrayDimensions, and friends to
411   // determine the numbers of elements instead? Looks like this is what is used
412   // for delinearization, but I'm not sure if it can be applied to the
413   // vectorized form - at least not without a bit more work than I feel
414   // comfortable with.
415 
416   // Search for Elems in the following SCEV:
417   // (1 + ((-VF + (VF * (((VF - 1) + %Elems) /u VF))<nuw>) /u VF))<nuw><nsw>
418   const SCEV *Elems = nullptr;
419   if (auto *TC = dyn_cast<SCEVAddExpr>(TripCountSE))
420     if (auto *Div = dyn_cast<SCEVUDivExpr>(TC->getOperand(1)))
421       if (auto *Add = dyn_cast<SCEVAddExpr>(Div->getLHS()))
422         if (auto *Mul = VisitAdd(Add))
423           if (auto *Div = VisitMul(Mul))
424             if (auto *Res = VisitDiv(Div))
425               Elems = Res;
426 
427   if (!Elems)
428     return nullptr;
429 
430   Instruction *InsertPt = L->getLoopPreheader()->getTerminator();
431   if (!isSafeToExpandAt(Elems, InsertPt, *SE))
432     return nullptr;
433 
434   auto DL = L->getHeader()->getModule()->getDataLayout();
435   SCEVExpander Expander(*SE, DL, "elements");
436   return Expander.expandCodeFor(Elems, Elems->getType(), InsertPt);
437 }
438 
439 // Look through the exit block to see whether there's a duplicate predicate
440 // instruction. This can happen when we need to perform a select on values
441 // from the last and previous iteration. Instead of doing a straight
442 // replacement of that predicate with the vctp, clone the vctp and place it
443 // in the block. This means that the VPR doesn't have to be live into the
444 // exit block which should make it easier to convert this loop into a proper
445 // tail predicated loop.
446 static bool Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
447                     SetVector<Instruction*> &MaybeDead, Loop *L) {
448   BasicBlock *Exit = L->getUniqueExitBlock();
449   if (!Exit) {
450     LLVM_DEBUG(dbgs() << "ARM TP: can't find loop exit block\n");
451     return false;
452   }
453 
454   bool ClonedVCTPInExitBlock = false;
455 
456   for (auto &Pair : NewPredicates) {
457     Instruction *OldPred = Pair.first;
458     Instruction *NewPred = Pair.second;
459 
460     for (auto &I : *Exit) {
461       if (I.isSameOperationAs(OldPred)) {
462         Instruction *PredClone = NewPred->clone();
463         PredClone->insertBefore(&I);
464         I.replaceAllUsesWith(PredClone);
465         MaybeDead.insert(&I);
466         ClonedVCTPInExitBlock = true;
467         LLVM_DEBUG(dbgs() << "ARM TP: replacing: "; I.dump();
468                    dbgs() << "ARM TP: with:      "; PredClone->dump());
469         break;
470       }
471     }
472   }
473 
474   // Drop references and add operands to check for dead.
475   SmallPtrSet<Instruction*, 4> Dead;
476   while (!MaybeDead.empty()) {
477     auto *I = MaybeDead.front();
478     MaybeDead.remove(I);
479     if (I->hasNUsesOrMore(1))
480       continue;
481 
482     for (auto &U : I->operands()) {
483       if (auto *OpI = dyn_cast<Instruction>(U))
484         MaybeDead.insert(OpI);
485     }
486     I->dropAllReferences();
487     Dead.insert(I);
488   }
489 
490   for (auto *I : Dead) {
491     LLVM_DEBUG(dbgs() << "ARM TP: removing dead insn: "; I->dump());
492     I->eraseFromParent();
493   }
494 
495   for (auto I : L->blocks())
496     DeleteDeadPHIs(I);
497 
498   return ClonedVCTPInExitBlock;
499 }
500 
501 void MVETailPredication::InsertVCTPIntrinsic(Instruction *Predicate,
502     DenseMap<Instruction*, Instruction*> &NewPredicates,
503     VectorType *VecTy, Value *NumElements) {
504   IRBuilder<> Builder(L->getHeader()->getFirstNonPHI());
505   Module *M = L->getHeader()->getModule();
506   Type *Ty = IntegerType::get(M->getContext(), 32);
507 
508   // Insert a phi to count the number of elements processed by the loop.
509   PHINode *Processed = Builder.CreatePHI(Ty, 2);
510   Processed->addIncoming(NumElements, L->getLoopPreheader());
511 
512   // Insert the intrinsic to represent the effect of tail predication.
513   Builder.SetInsertPoint(cast<Instruction>(Predicate));
514   ConstantInt *Factor =
515     ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements());
516 
517   Intrinsic::ID VCTPID;
518   switch (VecTy->getNumElements()) {
519   default:
520     llvm_unreachable("unexpected number of lanes");
521   case 4:  VCTPID = Intrinsic::arm_mve_vctp32; break;
522   case 8:  VCTPID = Intrinsic::arm_mve_vctp16; break;
523   case 16: VCTPID = Intrinsic::arm_mve_vctp8; break;
524 
525     // FIXME: vctp64 currently not supported because the predicate
526     // vector wants to be <2 x i1>, but v2i1 is not a legal MVE
527     // type, so problems happen at isel time.
528     // Intrinsic::arm_mve_vctp64 exists for ACLE intrinsics
529     // purposes, but takes a v4i1 instead of a v2i1.
530   }
531   Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
532   Value *TailPredicate = Builder.CreateCall(VCTP, Processed);
533   Predicate->replaceAllUsesWith(TailPredicate);
534   NewPredicates[Predicate] = cast<Instruction>(TailPredicate);
535 
536   // Add the incoming value to the new phi.
537   // TODO: This add likely already exists in the loop.
538   Value *Remaining = Builder.CreateSub(Processed, Factor);
539   Processed->addIncoming(Remaining, L->getLoopLatch());
540   LLVM_DEBUG(dbgs() << "ARM TP: Insert processed elements phi: "
541              << *Processed << "\n"
542              << "ARM TP: Inserted VCTP: " << *TailPredicate << "\n");
543 }
544 
545 bool MVETailPredication::TryConvert(Value *TripCount) {
546   if (!IsPredicatedVectorLoop()) {
547     LLVM_DEBUG(dbgs() << "ARM TP: no masked instructions in loop");
548     return false;
549   }
550 
551   LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n");
552 
553   // Walk through the masked intrinsics and try to find whether the predicate
554   // operand is generated from an induction variable.
555   SetVector<Instruction*> Predicates;
556   DenseMap<Instruction*, Instruction*> NewPredicates;
557 
558   for (auto *I : MaskedInsts) {
559     Intrinsic::ID ID = I->getIntrinsicID();
560     unsigned PredOp = ID == Intrinsic::masked_load ? 2 : 3;
561     auto *Predicate = dyn_cast<Instruction>(I->getArgOperand(PredOp));
562     if (!Predicate || Predicates.count(Predicate))
563       continue;
564 
565     VectorType *VecTy = getVectorType(I);
566     Value *NumElements = ComputeElements(TripCount, VecTy);
567     if (!NumElements)
568       continue;
569 
570     if (!isTailPredicate(Predicate, NumElements)) {
571       LLVM_DEBUG(dbgs() << "ARM TP: Not tail predicate: " << *Predicate << "\n");
572       continue;
573     }
574 
575     LLVM_DEBUG(dbgs() << "ARM TP: Found tail predicate: " << *Predicate << "\n");
576     Predicates.insert(Predicate);
577 
578     InsertVCTPIntrinsic(Predicate, NewPredicates, VecTy, NumElements);
579   }
580 
581   // Now clean up.
582   ClonedVCTPInExitBlock = Cleanup(NewPredicates, Predicates, L);
583   return true;
584 }
585 
586 Pass *llvm::createMVETailPredicationPass() {
587   return new MVETailPredication();
588 }
589 
// Definition of the static pass identifier declared in the class; the
// constructor passes it to LoopPass.
char MVETailPredication::ID = 0;

// Register the pass under DEBUG_TYPE with DESC as its description. No
// analysis dependencies are listed between BEGIN and END.
INITIALIZE_PASS_BEGIN(MVETailPredication, DEBUG_TYPE, DESC, false, false)
INITIALIZE_PASS_END(MVETailPredication, DEBUG_TYPE, DESC, false, false)
594