1 //===- MVETailPredication.cpp - MVE Tail Predication ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Armv8.1m introduced MVE, M-Profile Vector Extension, and low-overhead
11 /// branches to help accelerate DSP applications. These two extensions can be
12 /// combined to provide implicit vector predication within a low-overhead loop.
13 /// The HardwareLoops pass inserts intrinsics identifying loops that the
14 /// backend will attempt to convert into a low-overhead loop. The vectorizer is
15 /// responsible for generating a vectorized loop in which the lanes are
16 /// predicated upon the iteration counter. This pass looks at these predicated
17 /// vector loops, that are targets for low-overhead loops, and prepares it for
18 /// code generation. Once the vectorizer has produced a masked loop, there's a
19 /// couple of final forms:
20 /// - A tail-predicated loop, with implicit predication.
/// - A loop containing multiple VCTP instructions, predicating multiple VPT
22 ///   blocks of instructions operating on different vector types.
23 ///
/// This pass inserts the VCTP intrinsic to represent the effect of
25 /// tail predication. This will be picked up by the ARM Low-overhead loop pass,
26 /// which performs the final transformation to a DLSTP or WLSTP tail-predicated
27 /// loop.
28 
29 #include "ARM.h"
30 #include "ARMSubtarget.h"
31 #include "llvm/Analysis/LoopInfo.h"
32 #include "llvm/Analysis/LoopPass.h"
33 #include "llvm/Analysis/ScalarEvolution.h"
34 #include "llvm/Analysis/ScalarEvolutionExpander.h"
35 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
36 #include "llvm/Analysis/TargetTransformInfo.h"
37 #include "llvm/CodeGen/TargetPassConfig.h"
38 #include "llvm/InitializePasses.h"
39 #include "llvm/IR/IRBuilder.h"
40 #include "llvm/IR/Instructions.h"
41 #include "llvm/IR/IntrinsicsARM.h"
42 #include "llvm/IR/PatternMatch.h"
43 #include "llvm/Support/Debug.h"
44 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
45 #include "llvm/Transforms/Utils/LoopUtils.h"
46 
47 using namespace llvm;
48 
49 #define DEBUG_TYPE "mve-tail-predication"
50 #define DESC "Transform predicated vector loops to use MVE tail predication"
51 
/// Boolean command-line option "disable-mve-tail-predication". runOnLoop()
/// returns early without touching the loop while this is true, and the option
/// is initialised to true, so the transformation only runs when the flag is
/// explicitly set to false. The definition is not static, so the symbol is
/// visible outside this file.
cl::opt<bool>
DisableTailPredication("disable-mve-tail-predication", cl::Hidden,
                       cl::init(true),
                       cl::desc("Disable MVE Tail Predication"));
56 namespace {
57 
58 class MVETailPredication : public LoopPass {
59   SmallVector<IntrinsicInst*, 4> MaskedInsts;
60   Loop *L = nullptr;
61   LoopInfo *LI = nullptr;
62   const DataLayout *DL;
63   DominatorTree *DT = nullptr;
64   ScalarEvolution *SE = nullptr;
65   TargetTransformInfo *TTI = nullptr;
66   TargetLibraryInfo *TLI = nullptr;
67   bool ClonedVCTPInExitBlock = false;
68 
69 public:
70   static char ID;
71 
72   MVETailPredication() : LoopPass(ID) { }
73 
74   void getAnalysisUsage(AnalysisUsage &AU) const override {
75     AU.addRequired<ScalarEvolutionWrapperPass>();
76     AU.addRequired<LoopInfoWrapperPass>();
77     AU.addRequired<TargetPassConfig>();
78     AU.addRequired<TargetTransformInfoWrapperPass>();
79     AU.addRequired<DominatorTreeWrapperPass>();
80     AU.addRequired<TargetLibraryInfoWrapperPass>();
81     AU.addPreserved<LoopInfoWrapperPass>();
82     AU.setPreservesCFG();
83   }
84 
85   bool runOnLoop(Loop *L, LPPassManager&) override;
86 
87 private:
88 
89   /// Perform the relevant checks on the loop and convert if possible.
90   bool TryConvert(Value *TripCount);
91 
92   /// Return whether this is a vectorized loop, that contains masked
93   /// load/stores.
94   bool IsPredicatedVectorLoop();
95 
96   /// Compute a value for the total number of elements that the predicated
97   /// loop will process.
98   Value *ComputeElements(Value *TripCount, VectorType *VecTy);
99 
100   /// Is the icmp that generates an i1 vector, based upon a loop counter
101   /// and a limit that is defined outside the loop.
102   bool isTailPredicate(Instruction *Predicate, Value *NumElements);
103 
104   /// Insert the intrinsic to represent the effect of tail predication.
105   void InsertVCTPIntrinsic(Instruction *Predicate,
106                            DenseMap<Instruction*, Instruction*> &NewPredicates,
107                            VectorType *VecTy,
108                            Value *NumElements);
109 
110   /// Rematerialize the iteration count in exit blocks, which enables
111   /// ARMLowOverheadLoops to better optimise away loop update statements inside
112   /// hardware-loops.
113   void RematerializeIterCount();
114 };
115 
116 } // end namespace
117 
118 static bool IsDecrement(Instruction &I) {
119   auto *Call = dyn_cast<IntrinsicInst>(&I);
120   if (!Call)
121     return false;
122 
123   Intrinsic::ID ID = Call->getIntrinsicID();
124   return ID == Intrinsic::loop_decrement_reg;
125 }
126 
127 static bool IsMasked(Instruction *I) {
128   auto *Call = dyn_cast<IntrinsicInst>(I);
129   if (!Call)
130     return false;
131 
132   Intrinsic::ID ID = Call->getIntrinsicID();
133   // TODO: Support gather/scatter expand/compress operations.
134   return ID == Intrinsic::masked_store || ID == Intrinsic::masked_load;
135 }
136 
137 void MVETailPredication::RematerializeIterCount() {
138   SmallVector<WeakTrackingVH, 16> DeadInsts;
139   SCEVExpander Rewriter(*SE, *DL, "mvetp");
140   ReplaceExitVal ReplaceExitValue = AlwaysRepl;
141 
142   formLCSSARecursively(*L, *DT, LI, SE);
143   rewriteLoopExitValues(L, LI, TLI, SE, Rewriter, DT, ReplaceExitValue,
144                         DeadInsts);
145 }
146 
147 bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
148   if (skipLoop(L) || DisableTailPredication)
149     return false;
150 
151   MaskedInsts.clear();
152   Function &F = *L->getHeader()->getParent();
153   auto &TPC = getAnalysis<TargetPassConfig>();
154   auto &TM = TPC.getTM<TargetMachine>();
155   auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
156   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
157   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
158   TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
159   SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
160   auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
161   TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr;
162   DL = &L->getHeader()->getModule()->getDataLayout();
163   this->L = L;
164 
165   // The MVE and LOB extensions are combined to enable tail-predication, but
166   // there's nothing preventing us from generating VCTP instructions for v8.1m.
167   if (!ST->hasMVEIntegerOps() || !ST->hasV8_1MMainlineOps()) {
168     LLVM_DEBUG(dbgs() << "ARM TP: Not a v8.1m.main+mve target.\n");
169     return false;
170   }
171 
172   BasicBlock *Preheader = L->getLoopPreheader();
173   if (!Preheader)
174     return false;
175 
176   auto FindLoopIterations = [](BasicBlock *BB) -> IntrinsicInst* {
177     for (auto &I : *BB) {
178       auto *Call = dyn_cast<IntrinsicInst>(&I);
179       if (!Call)
180         continue;
181 
182       Intrinsic::ID ID = Call->getIntrinsicID();
183       if (ID == Intrinsic::set_loop_iterations ||
184           ID == Intrinsic::test_set_loop_iterations)
185         return cast<IntrinsicInst>(&I);
186     }
187     return nullptr;
188   };
189 
190   // Look for the hardware loop intrinsic that sets the iteration count.
191   IntrinsicInst *Setup = FindLoopIterations(Preheader);
192 
193   // The test.set iteration could live in the pre-preheader.
194   if (!Setup) {
195     if (!Preheader->getSinglePredecessor())
196       return false;
197     Setup = FindLoopIterations(Preheader->getSinglePredecessor());
198     if (!Setup)
199       return false;
200   }
201 
202   // Search for the hardware loop intrinic that decrements the loop counter.
203   IntrinsicInst *Decrement = nullptr;
204   for (auto *BB : L->getBlocks()) {
205     for (auto &I : *BB) {
206       if (IsDecrement(I)) {
207         Decrement = cast<IntrinsicInst>(&I);
208         break;
209       }
210     }
211   }
212 
213   if (!Decrement)
214     return false;
215 
216   LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"
217              << *Decrement << "\n");
218 
219   if (TryConvert(Setup->getArgOperand(0))) {
220     if (ClonedVCTPInExitBlock)
221       RematerializeIterCount();
222     return true;
223   }
224 
225   return false;
226 }
227 
228 bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) {
229   // Look for the following:
230 
231   // %trip.count.minus.1 = add i32 %N, -1
232   // %broadcast.splatinsert10 = insertelement <4 x i32> undef,
233   //                                          i32 %trip.count.minus.1, i32 0
234   // %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10,
235   //                                    <4 x i32> undef,
236   //                                    <4 x i32> zeroinitializer
237   // ...
238   // ...
239   // %index = phi i32
240   // %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
241   // %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
242   //                                  <4 x i32> undef,
243   //                                  <4 x i32> zeroinitializer
244   // %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
245   // %pred = icmp ule <4 x i32> %induction, %broadcast.splat11
246 
247   // And return whether V == %pred.
248 
249   using namespace PatternMatch;
250 
251   CmpInst::Predicate Pred;
252   Instruction *Shuffle = nullptr;
253   Instruction *Induction = nullptr;
254 
255   // The vector icmp
256   if (!match(I, m_ICmp(Pred, m_Instruction(Induction),
257                        m_Instruction(Shuffle))) ||
258       Pred != ICmpInst::ICMP_ULE)
259     return false;
260 
261   // First find the stuff outside the loop which is setting up the limit
262   // vector....
263   // The invariant shuffle that broadcast the limit into a vector.
264   Instruction *Insert = nullptr;
265   if (!match(Shuffle, m_ShuffleVector(m_Instruction(Insert), m_Undef(),
266                                       m_Zero())))
267     return false;
268 
269   // Insert the limit into a vector.
270   Instruction *BECount = nullptr;
271   if (!match(Insert, m_InsertElement(m_Undef(), m_Instruction(BECount),
272                                      m_Zero())))
273     return false;
274 
275   // The limit calculation, backedge count.
276   Value *TripCount = nullptr;
277   if (!match(BECount, m_Add(m_Value(TripCount), m_AllOnes())))
278     return false;
279 
280   if (TripCount != NumElements || !L->isLoopInvariant(BECount))
281     return false;
282 
283   // Now back to searching inside the loop body...
284   // Find the add with takes the index iv and adds a constant vector to it.
285   Instruction *BroadcastSplat = nullptr;
286   Constant *Const = nullptr;
287   if (!match(Induction, m_Add(m_Instruction(BroadcastSplat),
288                               m_Constant(Const))))
289    return false;
290 
291   // Check that we're adding <0, 1, 2, 3...
292   if (auto *CDS = dyn_cast<ConstantDataSequential>(Const)) {
293     for (unsigned i = 0; i < CDS->getNumElements(); ++i) {
294       if (CDS->getElementAsInteger(i) != i)
295         return false;
296     }
297   } else
298     return false;
299 
300   // The shuffle which broadcasts the index iv into a vector.
301   if (!match(BroadcastSplat, m_ShuffleVector(m_Instruction(Insert), m_Undef(),
302                                              m_Zero())))
303     return false;
304 
305   // The insert element which initialises a vector with the index iv.
306   Instruction *IV = nullptr;
307   if (!match(Insert, m_InsertElement(m_Undef(), m_Instruction(IV), m_Zero())))
308     return false;
309 
310   // The index iv.
311   auto *Phi = dyn_cast<PHINode>(IV);
312   if (!Phi)
313     return false;
314 
315   // TODO: Don't think we need to check the entry value.
316   Value *OnEntry = Phi->getIncomingValueForBlock(L->getLoopPreheader());
317   if (!match(OnEntry, m_Zero()))
318     return false;
319 
320   Value *InLoop = Phi->getIncomingValueForBlock(L->getLoopLatch());
321   unsigned Lanes = cast<VectorType>(Insert->getType())->getNumElements();
322 
323   Instruction *LHS = nullptr;
324   if (!match(InLoop, m_Add(m_Instruction(LHS), m_SpecificInt(Lanes))))
325     return false;
326 
327   return LHS == Phi;
328 }
329 
330 static VectorType* getVectorType(IntrinsicInst *I) {
331   unsigned TypeOp = I->getIntrinsicID() == Intrinsic::masked_load ? 0 : 1;
332   auto *PtrTy = cast<PointerType>(I->getOperand(TypeOp)->getType());
333   return cast<VectorType>(PtrTy->getElementType());
334 }
335 
336 bool MVETailPredication::IsPredicatedVectorLoop() {
337   // Check that the loop contains at least one masked load/store intrinsic.
338   // We only support 'normal' vector instructions - other than masked
339   // load/stores.
340   for (auto *BB : L->getBlocks()) {
341     for (auto &I : *BB) {
342       if (IsMasked(&I)) {
343         VectorType *VecTy = getVectorType(cast<IntrinsicInst>(&I));
344         unsigned Lanes = VecTy->getNumElements();
345         unsigned ElementWidth = VecTy->getScalarSizeInBits();
346         // MVE vectors are 128-bit, but don't support 128 x i1.
347         // TODO: Can we support vectors larger than 128-bits?
348         unsigned MaxWidth = TTI->getRegisterBitWidth(true);
349         if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth)
350           return false;
351         MaskedInsts.push_back(cast<IntrinsicInst>(&I));
352       } else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) {
353         for (auto &U : Int->args()) {
354           if (isa<VectorType>(U->getType()))
355             return false;
356         }
357       }
358     }
359   }
360 
361   return !MaskedInsts.empty();
362 }
363 
364 Value* MVETailPredication::ComputeElements(Value *TripCount,
365                                            VectorType *VecTy) {
366   const SCEV *TripCountSE = SE->getSCEV(TripCount);
367   ConstantInt *VF = ConstantInt::get(cast<IntegerType>(TripCount->getType()),
368                                      VecTy->getNumElements());
369 
370   if (VF->equalsInt(1))
371     return nullptr;
372 
373   // TODO: Support constant trip counts.
374   auto VisitAdd = [&](const SCEVAddExpr *S) -> const SCEVMulExpr* {
375     if (auto *Const = dyn_cast<SCEVConstant>(S->getOperand(0))) {
376       if (Const->getAPInt() != -VF->getValue())
377         return nullptr;
378     } else
379       return nullptr;
380     return dyn_cast<SCEVMulExpr>(S->getOperand(1));
381   };
382 
383   auto VisitMul = [&](const SCEVMulExpr *S) -> const SCEVUDivExpr* {
384     if (auto *Const = dyn_cast<SCEVConstant>(S->getOperand(0))) {
385       if (Const->getValue() != VF)
386         return nullptr;
387     } else
388       return nullptr;
389     return dyn_cast<SCEVUDivExpr>(S->getOperand(1));
390   };
391 
392   auto VisitDiv = [&](const SCEVUDivExpr *S) -> const SCEV* {
393     if (auto *Const = dyn_cast<SCEVConstant>(S->getRHS())) {
394       if (Const->getValue() != VF)
395         return nullptr;
396     } else
397       return nullptr;
398 
399     if (auto *RoundUp = dyn_cast<SCEVAddExpr>(S->getLHS())) {
400       if (auto *Const = dyn_cast<SCEVConstant>(RoundUp->getOperand(0))) {
401         if (Const->getAPInt() != (VF->getValue() - 1))
402           return nullptr;
403       } else
404         return nullptr;
405 
406       return RoundUp->getOperand(1);
407     }
408     return nullptr;
409   };
410 
411   // TODO: Can we use SCEV helpers, such as findArrayDimensions, and friends to
412   // determine the numbers of elements instead? Looks like this is what is used
413   // for delinearization, but I'm not sure if it can be applied to the
414   // vectorized form - at least not without a bit more work than I feel
415   // comfortable with.
416 
417   // Search for Elems in the following SCEV:
418   // (1 + ((-VF + (VF * (((VF - 1) + %Elems) /u VF))<nuw>) /u VF))<nuw><nsw>
419   const SCEV *Elems = nullptr;
420   if (auto *TC = dyn_cast<SCEVAddExpr>(TripCountSE))
421     if (auto *Div = dyn_cast<SCEVUDivExpr>(TC->getOperand(1)))
422       if (auto *Add = dyn_cast<SCEVAddExpr>(Div->getLHS()))
423         if (auto *Mul = VisitAdd(Add))
424           if (auto *Div = VisitMul(Mul))
425             if (auto *Res = VisitDiv(Div))
426               Elems = Res;
427 
428   if (!Elems)
429     return nullptr;
430 
431   Instruction *InsertPt = L->getLoopPreheader()->getTerminator();
432   if (!isSafeToExpandAt(Elems, InsertPt, *SE))
433     return nullptr;
434 
435   auto DL = L->getHeader()->getModule()->getDataLayout();
436   SCEVExpander Expander(*SE, DL, "elements");
437   return Expander.expandCodeFor(Elems, Elems->getType(), InsertPt);
438 }
439 
440 // Look through the exit block to see whether there's a duplicate predicate
441 // instruction. This can happen when we need to perform a select on values
442 // from the last and previous iteration. Instead of doing a straight
443 // replacement of that predicate with the vctp, clone the vctp and place it
444 // in the block. This means that the VPR doesn't have to be live into the
445 // exit block which should make it easier to convert this loop into a proper
446 // tail predicated loop.
447 static bool Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
448                     SetVector<Instruction*> &MaybeDead, Loop *L) {
449   BasicBlock *Exit = L->getUniqueExitBlock();
450   if (!Exit) {
451     LLVM_DEBUG(dbgs() << "ARM TP: can't find loop exit block\n");
452     return false;
453   }
454 
455   bool ClonedVCTPInExitBlock = false;
456 
457   for (auto &Pair : NewPredicates) {
458     Instruction *OldPred = Pair.first;
459     Instruction *NewPred = Pair.second;
460 
461     for (auto &I : *Exit) {
462       if (I.isSameOperationAs(OldPred)) {
463         Instruction *PredClone = NewPred->clone();
464         PredClone->insertBefore(&I);
465         I.replaceAllUsesWith(PredClone);
466         MaybeDead.insert(&I);
467         ClonedVCTPInExitBlock = true;
468         LLVM_DEBUG(dbgs() << "ARM TP: replacing: "; I.dump();
469                    dbgs() << "ARM TP: with:      "; PredClone->dump());
470         break;
471       }
472     }
473   }
474 
475   // Drop references and add operands to check for dead.
476   SmallPtrSet<Instruction*, 4> Dead;
477   while (!MaybeDead.empty()) {
478     auto *I = MaybeDead.front();
479     MaybeDead.remove(I);
480     if (I->hasNUsesOrMore(1))
481       continue;
482 
483     for (auto &U : I->operands()) {
484       if (auto *OpI = dyn_cast<Instruction>(U))
485         MaybeDead.insert(OpI);
486     }
487     I->dropAllReferences();
488     Dead.insert(I);
489   }
490 
491   for (auto *I : Dead) {
492     LLVM_DEBUG(dbgs() << "ARM TP: removing dead insn: "; I->dump());
493     I->eraseFromParent();
494   }
495 
496   for (auto I : L->blocks())
497     DeleteDeadPHIs(I);
498 
499   return ClonedVCTPInExitBlock;
500 }
501 
502 void MVETailPredication::InsertVCTPIntrinsic(Instruction *Predicate,
503     DenseMap<Instruction*, Instruction*> &NewPredicates,
504     VectorType *VecTy, Value *NumElements) {
505   IRBuilder<> Builder(L->getHeader()->getFirstNonPHI());
506   Module *M = L->getHeader()->getModule();
507   Type *Ty = IntegerType::get(M->getContext(), 32);
508 
509   // Insert a phi to count the number of elements processed by the loop.
510   PHINode *Processed = Builder.CreatePHI(Ty, 2);
511   Processed->addIncoming(NumElements, L->getLoopPreheader());
512 
513   // Insert the intrinsic to represent the effect of tail predication.
514   Builder.SetInsertPoint(cast<Instruction>(Predicate));
515   ConstantInt *Factor =
516     ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements());
517 
518   Intrinsic::ID VCTPID;
519   switch (VecTy->getNumElements()) {
520   default:
521     llvm_unreachable("unexpected number of lanes");
522   case 4:  VCTPID = Intrinsic::arm_mve_vctp32; break;
523   case 8:  VCTPID = Intrinsic::arm_mve_vctp16; break;
524   case 16: VCTPID = Intrinsic::arm_mve_vctp8; break;
525 
526     // FIXME: vctp64 currently not supported because the predicate
527     // vector wants to be <2 x i1>, but v2i1 is not a legal MVE
528     // type, so problems happen at isel time.
529     // Intrinsic::arm_mve_vctp64 exists for ACLE intrinsics
530     // purposes, but takes a v4i1 instead of a v2i1.
531   }
532   Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
533   Value *TailPredicate = Builder.CreateCall(VCTP, Processed);
534   Predicate->replaceAllUsesWith(TailPredicate);
535   NewPredicates[Predicate] = cast<Instruction>(TailPredicate);
536 
537   // Add the incoming value to the new phi.
538   // TODO: This add likely already exists in the loop.
539   Value *Remaining = Builder.CreateSub(Processed, Factor);
540   Processed->addIncoming(Remaining, L->getLoopLatch());
541   LLVM_DEBUG(dbgs() << "ARM TP: Insert processed elements phi: "
542              << *Processed << "\n"
543              << "ARM TP: Inserted VCTP: " << *TailPredicate << "\n");
544 }
545 
546 bool MVETailPredication::TryConvert(Value *TripCount) {
547   if (!IsPredicatedVectorLoop()) {
548     LLVM_DEBUG(dbgs() << "ARM TP: no masked instructions in loop");
549     return false;
550   }
551 
552   LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n");
553 
554   // Walk through the masked intrinsics and try to find whether the predicate
555   // operand is generated from an induction variable.
556   SetVector<Instruction*> Predicates;
557   DenseMap<Instruction*, Instruction*> NewPredicates;
558 
559   for (auto *I : MaskedInsts) {
560     Intrinsic::ID ID = I->getIntrinsicID();
561     unsigned PredOp = ID == Intrinsic::masked_load ? 2 : 3;
562     auto *Predicate = dyn_cast<Instruction>(I->getArgOperand(PredOp));
563     if (!Predicate || Predicates.count(Predicate))
564       continue;
565 
566     VectorType *VecTy = getVectorType(I);
567     Value *NumElements = ComputeElements(TripCount, VecTy);
568     if (!NumElements)
569       continue;
570 
571     if (!isTailPredicate(Predicate, NumElements)) {
572       LLVM_DEBUG(dbgs() << "ARM TP: Not tail predicate: " << *Predicate << "\n");
573       continue;
574     }
575 
576     LLVM_DEBUG(dbgs() << "ARM TP: Found tail predicate: " << *Predicate << "\n");
577     Predicates.insert(Predicate);
578 
579     InsertVCTPIntrinsic(Predicate, NewPredicates, VecTy, NumElements);
580   }
581 
582   // Now clean up.
583   ClonedVCTPInExitBlock = Cleanup(NewPredicates, Predicates, L);
584   return true;
585 }
586 
/// Factory function: returns a newly heap-allocated MVETailPredication pass.
Pass *llvm::createMVETailPredicationPass() {
  return new MVETailPredication();
}
590 
// Definition of the static ID member that is handed to the LoopPass
// constructor.
char MVETailPredication::ID = 0;

// Pass registration under the name DEBUG_TYPE with the description DESC.
// No INITIALIZE_PASS_DEPENDENCY entries are listed between BEGIN and END.
INITIALIZE_PASS_BEGIN(MVETailPredication, DEBUG_TYPE, DESC, false, false)
INITIALIZE_PASS_END(MVETailPredication, DEBUG_TYPE, DESC, false, false)
595