//===----- SVEIntrinsicOpts - SVE ACLE Intrinsics Opts --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Performs general IR level optimizations on SVE intrinsics.
//
// This pass performs the following optimizations:
//
// - removes unnecessary reinterpret intrinsics
//   (llvm.aarch64.sve.convert.[to|from].svbool), e.g:
//     %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
//     %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
//
// - removes unnecessary ptrue intrinsics (llvm.aarch64.sve.ptrue), e.g:
//     %1 = @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
//     %2 = @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
//     ; (%1 can be replaced with a reinterpret of %2)
//
// - optimizes ptest intrinsics and phi instructions where the operands are
//   being needlessly converted to and from svbool_t.
//
//===----------------------------------------------------------------------===//

#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64-sve-intrinsic-opts"

namespace llvm {
void initializeSVEIntrinsicOptsPass(PassRegistry &);
}

namespace {
/// Legacy module pass that performs IR-level peephole optimizations on SVE
/// ACLE intrinsics (redundant svbool reinterprets, ptrue coalescing, and
/// ptest operand simplification).
struct SVEIntrinsicOpts : public ModulePass {
  static char ID; // Pass identification, replacement for typeid
  SVEIntrinsicOpts() : ModulePass(ID) {
    initializeSVEIntrinsicOptsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnModule(Module &M) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  static IntrinsicInst *isReinterpretToSVBool(Value *V);

  bool coalescePTrueIntrinsicCalls(BasicBlock &BB,
                                   SmallSetVector<IntrinsicInst *, 4> &PTrues);
  bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);

  /// Operates at the instruction-scope. I.e., optimizations are applied local
  /// to individual instructions.
  static bool optimizeIntrinsic(Instruction *I);
  bool optimizeIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);

  /// Operates at the function-scope. I.e., optimizations are applied local to
  /// the functions themselves.
  bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);

  static bool optimizeConvertFromSVBool(IntrinsicInst *I);
  static bool optimizePTest(IntrinsicInst *I);

  static bool processPhiNode(IntrinsicInst *I);
};
} // end anonymous namespace

void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const {
  // The DominatorTree is required for the RPO traversal in
  // optimizeIntrinsicCalls; the CFG itself is never modified.
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.setPreservesCFG();
}

char SVEIntrinsicOpts::ID = 0;
static const char *name = "SVE intrinsics optimizations";
INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)

namespace llvm {
ModulePass *createSVEIntrinsicOptsPass() { return new SVEIntrinsicOpts(); }
} // namespace llvm

/// Returns V if it is a call to llvm.aarch64.sve.convert.to.svbool, i.e. a
/// reinterpret of a predicate to the <vscale x 16 x i1> (aka svbool_t)
/// container type; nullptr otherwise.
IntrinsicInst *SVEIntrinsicOpts::isReinterpretToSVBool(Value *V) {
  IntrinsicInst *I = dyn_cast<IntrinsicInst>(V);
  if (!I)
    return nullptr;

  if (I->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
    return nullptr;

  return I;
}

/// Checks if a ptrue intrinsic call is promoted. The act of promoting a
/// ptrue will introduce zeroing. For example:
///
///     %1 = <vscale x 4 x i1> call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
///     %2 = <vscale x 16 x i1> call @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
///     %3 = <vscale x 8 x i1> call @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %2)
///
/// %1 is promoted, because it is converted:
///
///     <vscale x 4 x i1> => <vscale x 16 x i1> => <vscale x 8 x i1>
///
/// via a sequence of the SVE reinterpret intrinsics convert.{to,from}.svbool.
125 bool isPTruePromoted(IntrinsicInst *PTrue) { 126 // Find all users of this intrinsic that are calls to convert-to-svbool 127 // reinterpret intrinsics. 128 SmallVector<IntrinsicInst *, 4> ConvertToUses; 129 for (User *User : PTrue->users()) { 130 if (match(User, m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>())) { 131 ConvertToUses.push_back(cast<IntrinsicInst>(User)); 132 } 133 } 134 135 // If no such calls were found, this is ptrue is not promoted. 136 if (ConvertToUses.empty()) 137 return false; 138 139 // Otherwise, try to find users of the convert-to-svbool intrinsics that are 140 // calls to the convert-from-svbool intrinsic, and would result in some lanes 141 // being zeroed. 142 const auto *PTrueVTy = cast<ScalableVectorType>(PTrue->getType()); 143 for (IntrinsicInst *ConvertToUse : ConvertToUses) { 144 for (User *User : ConvertToUse->users()) { 145 auto *IntrUser = dyn_cast<IntrinsicInst>(User); 146 if (IntrUser && IntrUser->getIntrinsicID() == 147 Intrinsic::aarch64_sve_convert_from_svbool) { 148 const auto *IntrUserVTy = cast<ScalableVectorType>(IntrUser->getType()); 149 150 // Would some lanes become zeroed by the conversion? 151 if (IntrUserVTy->getElementCount().getKnownMinValue() > 152 PTrueVTy->getElementCount().getKnownMinValue()) 153 // This is a promoted ptrue. 154 return true; 155 } 156 } 157 } 158 159 // If no matching calls were found, this is not a promoted ptrue. 160 return false; 161 } 162 163 /// Attempts to coalesce ptrues in a basic block. 164 bool SVEIntrinsicOpts::coalescePTrueIntrinsicCalls( 165 BasicBlock &BB, SmallSetVector<IntrinsicInst *, 4> &PTrues) { 166 if (PTrues.size() <= 1) 167 return false; 168 169 // Find the ptrue with the most lanes. 
170 auto *MostEncompassingPTrue = *std::max_element( 171 PTrues.begin(), PTrues.end(), [](auto *PTrue1, auto *PTrue2) { 172 auto *PTrue1VTy = cast<ScalableVectorType>(PTrue1->getType()); 173 auto *PTrue2VTy = cast<ScalableVectorType>(PTrue2->getType()); 174 return PTrue1VTy->getElementCount().getKnownMinValue() < 175 PTrue2VTy->getElementCount().getKnownMinValue(); 176 }); 177 178 // Remove the most encompassing ptrue, as well as any promoted ptrues, leaving 179 // behind only the ptrues to be coalesced. 180 PTrues.remove(MostEncompassingPTrue); 181 PTrues.remove_if([](auto *PTrue) { return isPTruePromoted(PTrue); }); 182 183 // Hoist MostEncompassingPTrue to the start of the basic block. It is always 184 // safe to do this, since ptrue intrinsic calls are guaranteed to have no 185 // predecessors. 186 MostEncompassingPTrue->moveBefore(BB, BB.getFirstInsertionPt()); 187 188 LLVMContext &Ctx = BB.getContext(); 189 IRBuilder<> Builder(Ctx); 190 Builder.SetInsertPoint(&BB, ++MostEncompassingPTrue->getIterator()); 191 192 auto *MostEncompassingPTrueVTy = 193 cast<VectorType>(MostEncompassingPTrue->getType()); 194 auto *ConvertToSVBool = Builder.CreateIntrinsic( 195 Intrinsic::aarch64_sve_convert_to_svbool, {MostEncompassingPTrueVTy}, 196 {MostEncompassingPTrue}); 197 198 for (auto *PTrue : PTrues) { 199 auto *PTrueVTy = cast<VectorType>(PTrue->getType()); 200 201 Builder.SetInsertPoint(&BB, ++ConvertToSVBool->getIterator()); 202 auto *ConvertFromSVBool = 203 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, 204 {PTrueVTy}, {ConvertToSVBool}); 205 PTrue->replaceAllUsesWith(ConvertFromSVBool); 206 PTrue->eraseFromParent(); 207 } 208 209 return true; 210 } 211 212 /// The goal of this function is to remove redundant calls to the SVE ptrue 213 /// intrinsic in each basic block within the given functions. 
///
/// SVE ptrues have two representations in LLVM IR:
/// - a logical representation -- an arbitrary-width scalable vector of i1s,
///   i.e. <vscale x N x i1>.
/// - a physical representation (svbool, <vscale x 16 x i1>) -- a 16-element
///   scalable vector of i1s, i.e. <vscale x 16 x i1>.
///
/// The SVE ptrue intrinsic is used to create a logical representation of an
/// SVE predicate. Suppose that we have two SVE ptrue intrinsic calls: P1 and
/// P2. If P1 creates a logical SVE predicate that is at least as wide as the
/// logical SVE predicate created by P2, then all of the bits that are true in
/// the physical representation of P2 are necessarily also true in the
/// physical representation of P1. P1 'encompasses' P2, therefore, the
/// intrinsic call to P2 is redundant and can be replaced by an SVE reinterpret
/// of P1 via convert.{to,from}.svbool.
///
/// Currently, this pass only coalesces calls to SVE ptrue intrinsics
/// if they match the following conditions:
///
/// - the call to the intrinsic uses either the SV_ALL or SV_POW2 patterns.
///   SV_ALL indicates that all bits of the predicate vector are to be set to
///   true. SV_POW2 indicates that all bits of the predicate vector up to the
///   largest power-of-two are to be set to true.
/// - the result of the call to the intrinsic is not promoted to a wider
///   predicate. In this case, keeping the extra ptrue leads to better codegen
///   -- coalescing here would create an irreducible chain of SVE reinterprets
///   via convert.{to,from}.svbool.
///
/// EXAMPLE:
///
///     %1 = <vscale x 8 x i1> ptrue(i32 SV_ALL)
///     ; Logical:  <1, 1, 1, 1, 1, 1, 1, 1>
///     ; Physical: <1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0>
///     ...
248 /// 249 /// %2 = <vscale x 4 x i1> ptrue(i32 SV_ALL) 250 /// ; Logical: <1, 1, 1, 1> 251 /// ; Physical: <1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0> 252 /// ... 253 /// 254 /// Here, %2 can be replaced by an SVE reinterpret of %1, giving, for instance: 255 /// 256 /// %1 = <vscale x 8 x i1> ptrue(i32 i31) 257 /// %2 = <vscale x 16 x i1> convert.to.svbool(<vscale x 8 x i1> %1) 258 /// %3 = <vscale x 4 x i1> convert.from.svbool(<vscale x 16 x i1> %2) 259 /// 260 bool SVEIntrinsicOpts::optimizePTrueIntrinsicCalls( 261 SmallSetVector<Function *, 4> &Functions) { 262 bool Changed = false; 263 264 for (auto *F : Functions) { 265 for (auto &BB : *F) { 266 SmallSetVector<IntrinsicInst *, 4> SVAllPTrues; 267 SmallSetVector<IntrinsicInst *, 4> SVPow2PTrues; 268 269 // For each basic block, collect the used ptrues and try to coalesce them. 270 for (Instruction &I : BB) { 271 if (I.use_empty()) 272 continue; 273 274 auto *IntrI = dyn_cast<IntrinsicInst>(&I); 275 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 276 continue; 277 278 const auto PTruePattern = 279 cast<ConstantInt>(IntrI->getOperand(0))->getZExtValue(); 280 281 if (PTruePattern == AArch64SVEPredPattern::all) 282 SVAllPTrues.insert(IntrI); 283 if (PTruePattern == AArch64SVEPredPattern::pow2) 284 SVPow2PTrues.insert(IntrI); 285 } 286 287 Changed |= coalescePTrueIntrinsicCalls(BB, SVAllPTrues); 288 Changed |= coalescePTrueIntrinsicCalls(BB, SVPow2PTrues); 289 } 290 } 291 292 return Changed; 293 } 294 295 /// The function will remove redundant reinterprets casting in the presence 296 /// of the control flow 297 bool SVEIntrinsicOpts::processPhiNode(IntrinsicInst *X) { 298 299 SmallVector<Instruction *, 32> Worklist; 300 auto RequiredType = X->getType(); 301 302 auto *PN = dyn_cast<PHINode>(X->getArgOperand(0)); 303 assert(PN && "Expected Phi Node!"); 304 305 // Don't create a new Phi unless we can remove the old one. 
306 if (!PN->hasOneUse()) 307 return false; 308 309 for (Value *IncValPhi : PN->incoming_values()) { 310 auto *Reinterpret = isReinterpretToSVBool(IncValPhi); 311 if (!Reinterpret || 312 RequiredType != Reinterpret->getArgOperand(0)->getType()) 313 return false; 314 } 315 316 // Create the new Phi 317 LLVMContext &Ctx = PN->getContext(); 318 IRBuilder<> Builder(Ctx); 319 Builder.SetInsertPoint(PN); 320 PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues()); 321 Worklist.push_back(PN); 322 323 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { 324 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I)); 325 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I)); 326 Worklist.push_back(Reinterpret); 327 } 328 329 // Cleanup Phi Node and reinterprets 330 X->replaceAllUsesWith(NPN); 331 X->eraseFromParent(); 332 333 for (auto &I : Worklist) 334 if (I->use_empty()) 335 I->eraseFromParent(); 336 337 return true; 338 } 339 340 bool SVEIntrinsicOpts::optimizePTest(IntrinsicInst *I) { 341 IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(I->getArgOperand(0)); 342 IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(I->getArgOperand(1)); 343 344 if (Op1 && Op2 && 345 Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && 346 Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && 347 Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) { 348 349 Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)}; 350 Type *Tys[] = {Op1->getArgOperand(0)->getType()}; 351 Module *M = I->getParent()->getParent()->getParent(); 352 353 auto Fn = Intrinsic::getDeclaration(M, I->getIntrinsicID(), Tys); 354 auto CI = CallInst::Create(Fn, Ops, I->getName(), I); 355 356 I->replaceAllUsesWith(CI); 357 I->eraseFromParent(); 358 if (Op1->use_empty()) 359 Op1->eraseFromParent(); 360 if (Op1 != Op2 && Op2->use_empty()) 361 Op2->eraseFromParent(); 362 363 return true; 364 } 365 366 return false; 367 } 368 369 bool 
SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) { 370 assert(I->getIntrinsicID() == Intrinsic::aarch64_sve_convert_from_svbool && 371 "Unexpected opcode"); 372 373 // If the reinterpret instruction operand is a PHI Node 374 if (isa<PHINode>(I->getArgOperand(0))) 375 return processPhiNode(I); 376 377 SmallVector<Instruction *, 32> CandidatesForRemoval; 378 Value *Cursor = I->getOperand(0), *EarliestReplacement = nullptr; 379 380 const auto *IVTy = cast<VectorType>(I->getType()); 381 382 // Walk the chain of conversions. 383 while (Cursor) { 384 // If the type of the cursor has fewer lanes than the final result, zeroing 385 // must take place, which breaks the equivalence chain. 386 const auto *CursorVTy = cast<VectorType>(Cursor->getType()); 387 if (CursorVTy->getElementCount().getKnownMinValue() < 388 IVTy->getElementCount().getKnownMinValue()) 389 break; 390 391 // If the cursor has the same type as I, it is a viable replacement. 392 if (Cursor->getType() == IVTy) 393 EarliestReplacement = Cursor; 394 395 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor); 396 397 // If this is not an SVE conversion intrinsic, this is the end of the chain. 398 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == 399 Intrinsic::aarch64_sve_convert_to_svbool || 400 IntrinsicCursor->getIntrinsicID() == 401 Intrinsic::aarch64_sve_convert_from_svbool)) 402 break; 403 404 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor); 405 Cursor = IntrinsicCursor->getOperand(0); 406 } 407 408 // If no viable replacement in the conversion chain was found, there is 409 // nothing to do. 
410 if (!EarliestReplacement) 411 return false; 412 413 I->replaceAllUsesWith(EarliestReplacement); 414 I->eraseFromParent(); 415 416 while (!CandidatesForRemoval.empty()) { 417 Instruction *Candidate = CandidatesForRemoval.pop_back_val(); 418 if (Candidate->use_empty()) 419 Candidate->eraseFromParent(); 420 } 421 return true; 422 } 423 424 bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) { 425 IntrinsicInst *IntrI = dyn_cast<IntrinsicInst>(I); 426 if (!IntrI) 427 return false; 428 429 switch (IntrI->getIntrinsicID()) { 430 case Intrinsic::aarch64_sve_convert_from_svbool: 431 return optimizeConvertFromSVBool(IntrI); 432 case Intrinsic::aarch64_sve_ptest_any: 433 case Intrinsic::aarch64_sve_ptest_first: 434 case Intrinsic::aarch64_sve_ptest_last: 435 return optimizePTest(IntrI); 436 default: 437 return false; 438 } 439 440 return true; 441 } 442 443 bool SVEIntrinsicOpts::optimizeIntrinsicCalls( 444 SmallSetVector<Function *, 4> &Functions) { 445 bool Changed = false; 446 for (auto *F : Functions) { 447 DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>(*F).getDomTree(); 448 449 // Traverse the DT with an rpo walk so we see defs before uses, allowing 450 // simplification to be done incrementally. 451 BasicBlock *Root = DT->getRoot(); 452 ReversePostOrderTraversal<BasicBlock *> RPOT(Root); 453 for (auto *BB : RPOT) 454 for (Instruction &I : make_early_inc_range(*BB)) 455 Changed |= optimizeIntrinsic(&I); 456 } 457 return Changed; 458 } 459 460 bool SVEIntrinsicOpts::optimizeFunctions( 461 SmallSetVector<Function *, 4> &Functions) { 462 bool Changed = false; 463 464 Changed |= optimizePTrueIntrinsicCalls(Functions); 465 Changed |= optimizeIntrinsicCalls(Functions); 466 467 return Changed; 468 } 469 470 bool SVEIntrinsicOpts::runOnModule(Module &M) { 471 bool Changed = false; 472 SmallSetVector<Function *, 4> Functions; 473 474 // Check for SVE intrinsic declarations first so that we only iterate over 475 // relevant functions. 
Where an appropriate declaration is found, store the 476 // function(s) where it is used so we can target these only. 477 for (auto &F : M.getFunctionList()) { 478 if (!F.isDeclaration()) 479 continue; 480 481 switch (F.getIntrinsicID()) { 482 case Intrinsic::aarch64_sve_convert_from_svbool: 483 case Intrinsic::aarch64_sve_ptest_any: 484 case Intrinsic::aarch64_sve_ptest_first: 485 case Intrinsic::aarch64_sve_ptest_last: 486 case Intrinsic::aarch64_sve_ptrue: 487 for (User *U : F.users()) 488 Functions.insert(cast<Instruction>(U)->getFunction()); 489 break; 490 default: 491 break; 492 } 493 } 494 495 if (!Functions.empty()) 496 Changed |= optimizeFunctions(Functions); 497 498 return Changed; 499 } 500