1 //===----- SVEIntrinsicOpts - SVE ACLE Intrinsics Opts --------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // Performs general IR level optimizations on SVE intrinsics.
11 //
12 // This pass performs the following optimizations:
13 //
14 // - removes unnecessary reinterpret intrinsics
15 //   (llvm.aarch64.sve.convert.[to|from].svbool), e.g:
16 //     %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
17 //     %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
18 //
19 // - removes unnecessary ptrue intrinsics (llvm.aarch64.sve.ptrue), e.g:
20 //     %1 = @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
21 //     %2 = @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
22 //     ; (%1 can be replaced with a reinterpret of %2)
23 //
24 // - optimizes ptest intrinsics and phi instructions where the operands are
25 //   being needlessly converted to and from svbool_t.
26 //
27 //===----------------------------------------------------------------------===//
28 
29 #include "Utils/AArch64BaseInfo.h"
30 #include "llvm/ADT/PostOrderIterator.h"
31 #include "llvm/ADT/SetVector.h"
32 #include "llvm/IR/Constants.h"
33 #include "llvm/IR/Dominators.h"
34 #include "llvm/IR/IRBuilder.h"
35 #include "llvm/IR/Instructions.h"
36 #include "llvm/IR/IntrinsicInst.h"
37 #include "llvm/IR/IntrinsicsAArch64.h"
38 #include "llvm/IR/LLVMContext.h"
39 #include "llvm/IR/PatternMatch.h"
40 #include "llvm/InitializePasses.h"
41 #include "llvm/Support/Debug.h"
42 
43 using namespace llvm;
44 using namespace llvm::PatternMatch;
45 
46 #define DEBUG_TYPE "aarch64-sve-intrinsic-opts"
47 
namespace llvm {
// Forward declaration of the pass initializer; the definition is generated by
// the INITIALIZE_PASS_* macros below.
void initializeSVEIntrinsicOptsPass(PassRegistry &);
}
51 
52 namespace {
/// Legacy module pass that performs IR-level peephole optimizations on SVE
/// ACLE intrinsic calls: redundant reinterpret removal, ptrue coalescing and
/// ptest/phi cleanups (see the file header for details).
struct SVEIntrinsicOpts : public ModulePass {
  static char ID; // Pass identification, replacement for typeid
  SVEIntrinsicOpts() : ModulePass(ID) {
    initializeSVEIntrinsicOptsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnModule(Module &M) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  /// Returns \p V as an IntrinsicInst if it is a call to
  /// llvm.aarch64.sve.convert.to.svbool, nullptr otherwise.
  static IntrinsicInst *isReinterpretToSVBool(Value *V);

  /// Replaces the redundant ptrues in \p PTrues (all within \p BB) with
  /// reinterprets of the widest one.
  bool coalescePTrueIntrinsicCalls(BasicBlock &BB,
                                   SmallSetVector<IntrinsicInst *, 4> &PTrues);
  /// Collects coalesceable ptrue calls per basic block and coalesces them.
  bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);

  /// Operates at the instruction-scope. I.e., optimizations are applied local
  /// to individual instructions.
  static bool optimizeIntrinsic(Instruction *I);
  /// Runs optimizeIntrinsic over every instruction in \p Functions.
  bool optimizeIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);

  /// Operates at the function-scope. I.e., optimizations are applied local to
  /// the functions themselves.
  bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);

  /// Folds chains of convert.{to,from}.svbool reinterprets.
  static bool optimizeConvertFromSVBool(IntrinsicInst *I);
  /// Rewrites ptest calls whose operands are needless svbool conversions.
  static bool optimizePTest(IntrinsicInst *I);

  /// Hoists reinterprets out of a phi node when all incoming values match.
  static bool processPhiNode(IntrinsicInst *I);
};
83 } // end anonymous namespace
84 
85 void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const {
86   AU.addRequired<DominatorTreeWrapperPass>();
87   AU.setPreservesCFG();
88 }
89 
char SVEIntrinsicOpts::ID = 0;
// Human-readable pass name, shown e.g. in -debug-pass output.
static const char *name = "SVE intrinsics optimizations";
INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
95 
96 namespace llvm {
97 ModulePass *createSVEIntrinsicOptsPass() { return new SVEIntrinsicOpts(); }
98 } // namespace llvm
99 
100 /// Returns V if it's a cast from <n x 16 x i1> (aka svbool_t), nullptr
101 /// otherwise.
102 IntrinsicInst *SVEIntrinsicOpts::isReinterpretToSVBool(Value *V) {
103   IntrinsicInst *I = dyn_cast<IntrinsicInst>(V);
104   if (!I)
105     return nullptr;
106 
107   if (I->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
108     return nullptr;
109 
110   return I;
111 }
112 
113 /// Checks if a ptrue intrinsic call is promoted. The act of promoting a
114 /// ptrue will introduce zeroing. For example:
115 ///
116 ///     %1 = <vscale x 4 x i1> call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
117 ///     %2 = <vscale x 16 x i1> call @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
118 ///     %3 = <vscale x 8 x i1> call @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %2)
119 ///
120 /// %1 is promoted, because it is converted:
121 ///
122 ///     <vscale x 4 x i1> => <vscale x 16 x i1> => <vscale x 8 x i1>
123 ///
124 /// via a sequence of the SVE reinterpret intrinsics convert.{to,from}.svbool.
125 bool isPTruePromoted(IntrinsicInst *PTrue) {
126   // Find all users of this intrinsic that are calls to convert-to-svbool
127   // reinterpret intrinsics.
128   SmallVector<IntrinsicInst *, 4> ConvertToUses;
129   for (User *User : PTrue->users()) {
130     if (match(User, m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>())) {
131       ConvertToUses.push_back(cast<IntrinsicInst>(User));
132     }
133   }
134 
135   // If no such calls were found, this is ptrue is not promoted.
136   if (ConvertToUses.empty())
137     return false;
138 
139   // Otherwise, try to find users of the convert-to-svbool intrinsics that are
140   // calls to the convert-from-svbool intrinsic, and would result in some lanes
141   // being zeroed.
142   const auto *PTrueVTy = cast<ScalableVectorType>(PTrue->getType());
143   for (IntrinsicInst *ConvertToUse : ConvertToUses) {
144     for (User *User : ConvertToUse->users()) {
145       auto *IntrUser = dyn_cast<IntrinsicInst>(User);
146       if (IntrUser && IntrUser->getIntrinsicID() ==
147                           Intrinsic::aarch64_sve_convert_from_svbool) {
148         const auto *IntrUserVTy = cast<ScalableVectorType>(IntrUser->getType());
149 
150         // Would some lanes become zeroed by the conversion?
151         if (IntrUserVTy->getElementCount().getKnownMinValue() >
152             PTrueVTy->getElementCount().getKnownMinValue())
153           // This is a promoted ptrue.
154           return true;
155       }
156     }
157   }
158 
159   // If no matching calls were found, this is not a promoted ptrue.
160   return false;
161 }
162 
/// Attempts to coalesce ptrues in a basic block.
///
/// All ptrues in \p PTrues live in \p BB and were collected with a single
/// predicate pattern (see optimizePTrueIntrinsicCalls). The widest ptrue is
/// kept (hoisted to the block start) and every other, non-promoted ptrue is
/// replaced by a convert.to.svbool/convert.from.svbool reinterpret pair of
/// it, then erased.
///
/// \returns true if any IR was changed.
bool SVEIntrinsicOpts::coalescePTrueIntrinsicCalls(
    BasicBlock &BB, SmallSetVector<IntrinsicInst *, 4> &PTrues) {
  // With fewer than two ptrues there is nothing to coalesce.
  if (PTrues.size() <= 1)
    return false;

  // Find the ptrue with the most lanes.
  auto *MostEncompassingPTrue = *std::max_element(
      PTrues.begin(), PTrues.end(), [](auto *PTrue1, auto *PTrue2) {
        auto *PTrue1VTy = cast<ScalableVectorType>(PTrue1->getType());
        auto *PTrue2VTy = cast<ScalableVectorType>(PTrue2->getType());
        return PTrue1VTy->getElementCount().getKnownMinValue() <
               PTrue2VTy->getElementCount().getKnownMinValue();
      });

  // Remove the most encompassing ptrue, as well as any promoted ptrues, leaving
  // behind only the ptrues to be coalesced.
  PTrues.remove(MostEncompassingPTrue);
  PTrues.remove_if([](auto *PTrue) { return isPTruePromoted(PTrue); });

  // Hoist MostEncompassingPTrue to the start of the basic block. It is always
  // safe to do this, since ptrue intrinsic calls are guaranteed to have no
  // predecessors.
  MostEncompassingPTrue->moveBefore(BB, BB.getFirstInsertionPt());

  // Insert a single convert.to.svbool immediately after the hoisted ptrue;
  // every replacement below converts back from this one value.
  LLVMContext &Ctx = BB.getContext();
  IRBuilder<> Builder(Ctx);
  Builder.SetInsertPoint(&BB, ++MostEncompassingPTrue->getIterator());

  auto *MostEncompassingPTrueVTy =
      cast<VectorType>(MostEncompassingPTrue->getType());
  auto *ConvertToSVBool = Builder.CreateIntrinsic(
      Intrinsic::aarch64_sve_convert_to_svbool, {MostEncompassingPTrueVTy},
      {MostEncompassingPTrue});

  for (auto *PTrue : PTrues) {
    auto *PTrueVTy = cast<VectorType>(PTrue->getType());

    // Each convert.from.svbool is inserted right after ConvertToSVBool, so it
    // dominates all uses of the ptrue it replaces further down the block.
    Builder.SetInsertPoint(&BB, ++ConvertToSVBool->getIterator());
    auto *ConvertFromSVBool =
        Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
                                {PTrueVTy}, {ConvertToSVBool});
    PTrue->replaceAllUsesWith(ConvertFromSVBool);
    PTrue->eraseFromParent();
  }

  return true;
}
211 
212 /// The goal of this function is to remove redundant calls to the SVE ptrue
213 /// intrinsic in each basic block within the given functions.
214 ///
215 /// SVE ptrues have two representations in LLVM IR:
216 /// - a logical representation -- an arbitrary-width scalable vector of i1s,
217 ///   i.e. <vscale x N x i1>.
218 /// - a physical representation (svbool, <vscale x 16 x i1>) -- a 16-element
219 ///   scalable vector of i1s, i.e. <vscale x 16 x i1>.
220 ///
221 /// The SVE ptrue intrinsic is used to create a logical representation of an SVE
222 /// predicate. Suppose that we have two SVE ptrue intrinsic calls: P1 and P2. If
223 /// P1 creates a logical SVE predicate that is at least as wide as the logical
224 /// SVE predicate created by P2, then all of the bits that are true in the
225 /// physical representation of P2 are necessarily also true in the physical
226 /// representation of P1. P1 'encompasses' P2, therefore, the intrinsic call to
227 /// P2 is redundant and can be replaced by an SVE reinterpret of P1 via
228 /// convert.{to,from}.svbool.
229 ///
230 /// Currently, this pass only coalesces calls to SVE ptrue intrinsics
231 /// if they match the following conditions:
232 ///
233 /// - the call to the intrinsic uses either the SV_ALL or SV_POW2 patterns.
234 ///   SV_ALL indicates that all bits of the predicate vector are to be set to
235 ///   true. SV_POW2 indicates that all bits of the predicate vector up to the
236 ///   largest power-of-two are to be set to true.
237 /// - the result of the call to the intrinsic is not promoted to a wider
238 ///   predicate. In this case, keeping the extra ptrue leads to better codegen
239 ///   -- coalescing here would create an irreducible chain of SVE reinterprets
240 ///   via convert.{to,from}.svbool.
241 ///
242 /// EXAMPLE:
243 ///
244 ///     %1 = <vscale x 8 x i1> ptrue(i32 SV_ALL)
245 ///     ; Logical:  <1, 1, 1, 1, 1, 1, 1, 1>
246 ///     ; Physical: <1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0>
247 ///     ...
248 ///
249 ///     %2 = <vscale x 4 x i1> ptrue(i32 SV_ALL)
250 ///     ; Logical:  <1, 1, 1, 1>
251 ///     ; Physical: <1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0>
252 ///     ...
253 ///
254 /// Here, %2 can be replaced by an SVE reinterpret of %1, giving, for instance:
255 ///
256 ///     %1 = <vscale x 8 x i1> ptrue(i32 i31)
257 ///     %2 = <vscale x 16 x i1> convert.to.svbool(<vscale x 8 x i1> %1)
258 ///     %3 = <vscale x 4 x i1> convert.from.svbool(<vscale x 16 x i1> %2)
259 ///
260 bool SVEIntrinsicOpts::optimizePTrueIntrinsicCalls(
261     SmallSetVector<Function *, 4> &Functions) {
262   bool Changed = false;
263 
264   for (auto *F : Functions) {
265     for (auto &BB : *F) {
266       SmallSetVector<IntrinsicInst *, 4> SVAllPTrues;
267       SmallSetVector<IntrinsicInst *, 4> SVPow2PTrues;
268 
269       // For each basic block, collect the used ptrues and try to coalesce them.
270       for (Instruction &I : BB) {
271         if (I.use_empty())
272           continue;
273 
274         auto *IntrI = dyn_cast<IntrinsicInst>(&I);
275         if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
276           continue;
277 
278         const auto PTruePattern =
279             cast<ConstantInt>(IntrI->getOperand(0))->getZExtValue();
280 
281         if (PTruePattern == AArch64SVEPredPattern::all)
282           SVAllPTrues.insert(IntrI);
283         if (PTruePattern == AArch64SVEPredPattern::pow2)
284           SVPow2PTrues.insert(IntrI);
285       }
286 
287       Changed |= coalescePTrueIntrinsicCalls(BB, SVAllPTrues);
288       Changed |= coalescePTrueIntrinsicCalls(BB, SVPow2PTrues);
289     }
290   }
291 
292   return Changed;
293 }
294 
/// Removes redundant reinterpret casts across control flow: \p X is a
/// convert.from.svbool call whose operand is a phi node. If every incoming
/// value of that phi is a convert.to.svbool of a predicate that already has
/// X's result type, the phi is rebuilt directly on the unconverted predicate
/// values and the conversions are deleted.
///
/// \returns true if the IR was changed.
bool SVEIntrinsicOpts::processPhiNode(IntrinsicInst *X) {

  SmallVector<Instruction *, 32> Worklist;
  auto RequiredType = X->getType();

  // The caller (optimizeConvertFromSVBool) only dispatches here when the
  // operand is a phi, hence the assert rather than a bail-out.
  auto *PN = dyn_cast<PHINode>(X->getArgOperand(0));
  assert(PN && "Expected Phi Node!");

  // Don't create a new Phi unless we can remove the old one.
  if (!PN->hasOneUse())
    return false;

  // Every incoming value must be a convert.to.svbool whose source already has
  // exactly the type X produces; otherwise the round-trip is not removable.
  for (Value *IncValPhi : PN->incoming_values()) {
    auto *Reinterpret = isReinterpretToSVBool(IncValPhi);
    if (!Reinterpret ||
        RequiredType != Reinterpret->getArgOperand(0)->getType())
      return false;
  }

  // Create the new Phi
  LLVMContext &Ctx = PN->getContext();
  IRBuilder<> Builder(Ctx);
  Builder.SetInsertPoint(PN);
  PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
  // NOTE: the old phi is pushed first so it is erased before the incoming
  // reinterprets below -- they only become use-free once the phi is gone.
  Worklist.push_back(PN);

  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
    auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
    NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
    Worklist.push_back(Reinterpret);
  }

  // Cleanup Phi Node and reinterprets
  X->replaceAllUsesWith(NPN);
  X->eraseFromParent();

  for (auto &I : Worklist)
    if (I->use_empty())
      I->eraseFromParent();

  return true;
}
339 
340 bool SVEIntrinsicOpts::optimizePTest(IntrinsicInst *I) {
341   IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(I->getArgOperand(0));
342   IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(I->getArgOperand(1));
343 
344   if (Op1 && Op2 &&
345       Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
346       Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
347       Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {
348 
349     Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
350     Type *Tys[] = {Op1->getArgOperand(0)->getType()};
351     Module *M = I->getParent()->getParent()->getParent();
352 
353     auto Fn = Intrinsic::getDeclaration(M, I->getIntrinsicID(), Tys);
354     auto CI = CallInst::Create(Fn, Ops, I->getName(), I);
355 
356     I->replaceAllUsesWith(CI);
357     I->eraseFromParent();
358     if (Op1->use_empty())
359       Op1->eraseFromParent();
360     if (Op1 != Op2 && Op2->use_empty())
361       Op2->eraseFromParent();
362 
363     return true;
364   }
365 
366   return false;
367 }
368 
369 bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) {
370   assert(I->getIntrinsicID() == Intrinsic::aarch64_sve_convert_from_svbool &&
371          "Unexpected opcode");
372 
373   // If the reinterpret instruction operand is a PHI Node
374   if (isa<PHINode>(I->getArgOperand(0)))
375     return processPhiNode(I);
376 
377   SmallVector<Instruction *, 32> CandidatesForRemoval;
378   Value *Cursor = I->getOperand(0), *EarliestReplacement = nullptr;
379 
380   const auto *IVTy = cast<VectorType>(I->getType());
381 
382   // Walk the chain of conversions.
383   while (Cursor) {
384     // If the type of the cursor has fewer lanes than the final result, zeroing
385     // must take place, which breaks the equivalence chain.
386     const auto *CursorVTy = cast<VectorType>(Cursor->getType());
387     if (CursorVTy->getElementCount().getKnownMinValue() <
388         IVTy->getElementCount().getKnownMinValue())
389       break;
390 
391     // If the cursor has the same type as I, it is a viable replacement.
392     if (Cursor->getType() == IVTy)
393       EarliestReplacement = Cursor;
394 
395     auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
396 
397     // If this is not an SVE conversion intrinsic, this is the end of the chain.
398     if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
399                                   Intrinsic::aarch64_sve_convert_to_svbool ||
400                               IntrinsicCursor->getIntrinsicID() ==
401                                   Intrinsic::aarch64_sve_convert_from_svbool))
402       break;
403 
404     CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
405     Cursor = IntrinsicCursor->getOperand(0);
406   }
407 
408   // If no viable replacement in the conversion chain was found, there is
409   // nothing to do.
410   if (!EarliestReplacement)
411     return false;
412 
413   I->replaceAllUsesWith(EarliestReplacement);
414   I->eraseFromParent();
415 
416   while (!CandidatesForRemoval.empty()) {
417     Instruction *Candidate = CandidatesForRemoval.pop_back_val();
418     if (Candidate->use_empty())
419       Candidate->eraseFromParent();
420   }
421   return true;
422 }
423 
424 bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) {
425   IntrinsicInst *IntrI = dyn_cast<IntrinsicInst>(I);
426   if (!IntrI)
427     return false;
428 
429   switch (IntrI->getIntrinsicID()) {
430   case Intrinsic::aarch64_sve_convert_from_svbool:
431     return optimizeConvertFromSVBool(IntrI);
432   case Intrinsic::aarch64_sve_ptest_any:
433   case Intrinsic::aarch64_sve_ptest_first:
434   case Intrinsic::aarch64_sve_ptest_last:
435     return optimizePTest(IntrI);
436   default:
437     return false;
438   }
439 
440   return true;
441 }
442 
443 bool SVEIntrinsicOpts::optimizeIntrinsicCalls(
444     SmallSetVector<Function *, 4> &Functions) {
445   bool Changed = false;
446   for (auto *F : Functions) {
447     DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>(*F).getDomTree();
448 
449     // Traverse the DT with an rpo walk so we see defs before uses, allowing
450     // simplification to be done incrementally.
451     BasicBlock *Root = DT->getRoot();
452     ReversePostOrderTraversal<BasicBlock *> RPOT(Root);
453     for (auto *BB : RPOT)
454       for (Instruction &I : make_early_inc_range(*BB))
455         Changed |= optimizeIntrinsic(&I);
456   }
457   return Changed;
458 }
459 
460 bool SVEIntrinsicOpts::optimizeFunctions(
461     SmallSetVector<Function *, 4> &Functions) {
462   bool Changed = false;
463 
464   Changed |= optimizePTrueIntrinsicCalls(Functions);
465   Changed |= optimizeIntrinsicCalls(Functions);
466 
467   return Changed;
468 }
469 
470 bool SVEIntrinsicOpts::runOnModule(Module &M) {
471   bool Changed = false;
472   SmallSetVector<Function *, 4> Functions;
473 
474   // Check for SVE intrinsic declarations first so that we only iterate over
475   // relevant functions. Where an appropriate declaration is found, store the
476   // function(s) where it is used so we can target these only.
477   for (auto &F : M.getFunctionList()) {
478     if (!F.isDeclaration())
479       continue;
480 
481     switch (F.getIntrinsicID()) {
482     case Intrinsic::aarch64_sve_convert_from_svbool:
483     case Intrinsic::aarch64_sve_ptest_any:
484     case Intrinsic::aarch64_sve_ptest_first:
485     case Intrinsic::aarch64_sve_ptest_last:
486     case Intrinsic::aarch64_sve_ptrue:
487       for (User *U : F.users())
488         Functions.insert(cast<Instruction>(U)->getFunction());
489       break;
490     default:
491       break;
492     }
493   }
494 
495   if (!Functions.empty())
496     Changed |= optimizeFunctions(Functions);
497 
498   return Changed;
499 }
500