1 //===- Construction of pass pipelines -------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// 10 /// This file provides the implementation of the PassBuilder based on our 11 /// static pass registry as well as related functionality. It also provides 12 /// helpers to aid in analyzing, debugging, and testing passes and pass 13 /// pipelines. 14 /// 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/Analysis/AliasAnalysis.h" 18 #include "llvm/Analysis/BasicAliasAnalysis.h" 19 #include "llvm/Analysis/CGSCCPassManager.h" 20 #include "llvm/Analysis/GlobalsModRef.h" 21 #include "llvm/Analysis/InlineAdvisor.h" 22 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 23 #include "llvm/Analysis/ProfileSummaryInfo.h" 24 #include "llvm/Analysis/ScopedNoAliasAA.h" 25 #include "llvm/Analysis/TypeBasedAliasAnalysis.h" 26 #include "llvm/IR/PassManager.h" 27 #include "llvm/Passes/OptimizationLevel.h" 28 #include "llvm/Passes/PassBuilder.h" 29 #include "llvm/Support/CommandLine.h" 30 #include "llvm/Support/ErrorHandling.h" 31 #include "llvm/Support/PGOOptions.h" 32 #include "llvm/Target/TargetMachine.h" 33 #include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h" 34 #include "llvm/Transforms/Coroutines/CoroCleanup.h" 35 #include "llvm/Transforms/Coroutines/CoroConditionalWrapper.h" 36 #include "llvm/Transforms/Coroutines/CoroEarly.h" 37 #include "llvm/Transforms/Coroutines/CoroElide.h" 38 #include "llvm/Transforms/Coroutines/CoroSplit.h" 39 #include "llvm/Transforms/IPO/AlwaysInliner.h" 40 #include "llvm/Transforms/IPO/Annotation2Metadata.h" 41 #include "llvm/Transforms/IPO/ArgumentPromotion.h" 42 #include "llvm/Transforms/IPO/Attributor.h" 43 #include "llvm/Transforms/IPO/CalledValuePropagation.h" 44 #include "llvm/Transforms/IPO/ConstantMerge.h" 45 #include "llvm/Transforms/IPO/CrossDSOCFI.h" 46 #include "llvm/Transforms/IPO/DeadArgumentElimination.h" 47 #include "llvm/Transforms/IPO/ElimAvailExtern.h" 48 #include "llvm/Transforms/IPO/ForceFunctionAttrs.h" 49 #include "llvm/Transforms/IPO/FunctionAttrs.h" 50 #include "llvm/Transforms/IPO/GlobalDCE.h" 51 #include "llvm/Transforms/IPO/GlobalOpt.h" 52 #include "llvm/Transforms/IPO/GlobalSplit.h" 53 #include "llvm/Transforms/IPO/HotColdSplitting.h" 54 #include "llvm/Transforms/IPO/IROutliner.h" 55 #include "llvm/Transforms/IPO/InferFunctionAttrs.h" 56 #include "llvm/Transforms/IPO/Inliner.h" 57 #include "llvm/Transforms/IPO/LowerTypeTests.h" 58 #include "llvm/Transforms/IPO/MergeFunctions.h" 59 #include "llvm/Transforms/IPO/ModuleInliner.h" 60 #include "llvm/Transforms/IPO/OpenMPOpt.h" 61 #include "llvm/Transforms/IPO/PartialInlining.h" 62 #include "llvm/Transforms/IPO/SCCP.h" 63 #include "llvm/Transforms/IPO/SampleProfile.h" 64 #include "llvm/Transforms/IPO/SampleProfileProbe.h" 65 #include "llvm/Transforms/IPO/SyntheticCountsPropagation.h" 66 #include "llvm/Transforms/IPO/WholeProgramDevirt.h" 67 #include "llvm/Transforms/InstCombine/InstCombine.h" 68 #include "llvm/Transforms/Instrumentation/CGProfile.h" 69 #include "llvm/Transforms/Instrumentation/ControlHeightReduction.h" 70 #include "llvm/Transforms/Instrumentation/InstrOrderFile.h" 71 #include "llvm/Transforms/Instrumentation/InstrProfiling.h" 72 #include "llvm/Transforms/Instrumentation/MemProfiler.h" 73 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" 74 #include "llvm/Transforms/Scalar/ADCE.h" 75 #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h" 76 #include "llvm/Transforms/Scalar/AnnotationRemarks.h" 77 #include "llvm/Transforms/Scalar/BDCE.h" 78 #include "llvm/Transforms/Scalar/CallSiteSplitting.h" 79 #include "llvm/Transforms/Scalar/ConstraintElimination.h" 80 #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h" 81 #include "llvm/Transforms/Scalar/DFAJumpThreading.h" 82 #include "llvm/Transforms/Scalar/DeadStoreElimination.h" 83 #include "llvm/Transforms/Scalar/DivRemPairs.h" 84 #include "llvm/Transforms/Scalar/EarlyCSE.h" 85 #include "llvm/Transforms/Scalar/Float2Int.h" 86 #include "llvm/Transforms/Scalar/GVN.h" 87 #include "llvm/Transforms/Scalar/IndVarSimplify.h" 88 #include "llvm/Transforms/Scalar/InstSimplifyPass.h" 89 #include "llvm/Transforms/Scalar/JumpThreading.h" 90 #include "llvm/Transforms/Scalar/LICM.h" 91 #include "llvm/Transforms/Scalar/LoopDeletion.h" 92 #include "llvm/Transforms/Scalar/LoopDistribute.h" 93 #include "llvm/Transforms/Scalar/LoopFlatten.h" 94 #include "llvm/Transforms/Scalar/LoopIdiomRecognize.h" 95 #include "llvm/Transforms/Scalar/LoopInstSimplify.h" 96 #include "llvm/Transforms/Scalar/LoopInterchange.h" 97 #include "llvm/Transforms/Scalar/LoopLoadElimination.h" 98 #include "llvm/Transforms/Scalar/LoopPassManager.h" 99 #include "llvm/Transforms/Scalar/LoopRotation.h" 100 #include "llvm/Transforms/Scalar/LoopSimplifyCFG.h" 101 #include "llvm/Transforms/Scalar/LoopSink.h" 102 #include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h" 103 #include "llvm/Transforms/Scalar/LoopUnrollPass.h" 104 #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h" 105 #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h" 106 #include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h" 107 #include "llvm/Transforms/Scalar/MemCpyOptimizer.h" 108 #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h" 109 #include "llvm/Transforms/Scalar/NewGVN.h" 110 #include "llvm/Transforms/Scalar/Reassociate.h" 111 #include "llvm/Transforms/Scalar/SCCP.h" 112 #include "llvm/Transforms/Scalar/SROA.h" 113 #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" 114 #include "llvm/Transforms/Scalar/SimplifyCFG.h" 115 #include "llvm/Transforms/Scalar/SpeculativeExecution.h" 116 #include "llvm/Transforms/Scalar/TailRecursionElimination.h" 117 #include "llvm/Transforms/Scalar/WarnMissedTransforms.h" 118 #include "llvm/Transforms/Utils/AddDiscriminators.h" 119 #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" 120 #include "llvm/Transforms/Utils/CanonicalizeAliases.h" 121 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 122 #include "llvm/Transforms/Utils/LibCallsShrinkWrap.h" 123 #include "llvm/Transforms/Utils/Mem2Reg.h" 124 #include "llvm/Transforms/Utils/NameAnonGlobals.h" 125 #include "llvm/Transforms/Utils/RelLookupTableConverter.h" 126 #include "llvm/Transforms/Utils/SimplifyCFGOptions.h" 127 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 128 #include "llvm/Transforms/Vectorize/SLPVectorizer.h" 129 #include "llvm/Transforms/Vectorize/VectorCombine.h" 130 131 using namespace llvm; 132 133 static cl::opt<InliningAdvisorMode> UseInlineAdvisor( 134 "enable-ml-inliner", cl::init(InliningAdvisorMode::Default), cl::Hidden, 135 cl::desc("Enable ML policy for inliner. Currently trained for -Oz only"), 136 cl::values(clEnumValN(InliningAdvisorMode::Default, "default", 137 "Heuristics-based inliner version."), 138 clEnumValN(InliningAdvisorMode::Development, "development", 139 "Use development mode (runtime-loadable model)."), 140 clEnumValN(InliningAdvisorMode::Release, "release", 141 "Use release mode (AOT-compiled model)."))); 142 143 static cl::opt<bool> EnableSyntheticCounts( 144 "enable-npm-synthetic-counts", cl::init(false), cl::Hidden, cl::ZeroOrMore, 145 cl::desc("Run synthetic function entry count generation " 146 "pass")); 147 148 /// Flag to enable inline deferral during PGO. 149 static cl::opt<bool> 150 EnablePGOInlineDeferral("enable-npm-pgo-inline-deferral", cl::init(true), 151 cl::Hidden, 152 cl::desc("Enable inline deferral during PGO")); 153 154 static cl::opt<bool> EnableMemProfiler("enable-mem-prof", cl::init(false), 155 cl::Hidden, cl::ZeroOrMore, 156 cl::desc("Enable memory profiler")); 157 158 static cl::opt<bool> EnableModuleInliner("enable-module-inliner", 159 cl::init(false), cl::Hidden, 160 cl::desc("Enable module inliner")); 161 162 static cl::opt<bool> PerformMandatoryInliningsFirst( 163 "mandatory-inlining-first", cl::init(true), cl::Hidden, cl::ZeroOrMore, 164 cl::desc("Perform mandatory inlinings module-wide, before performing " 165 "inlining.")); 166 167 static cl::opt<bool> EnableO3NonTrivialUnswitching( 168 "enable-npm-O3-nontrivial-unswitch", cl::init(true), cl::Hidden, 169 cl::ZeroOrMore, cl::desc("Enable non-trivial loop unswitching for -O3")); 170 171 static cl::opt<bool> EnableEagerlyInvalidateAnalyses( 172 "eagerly-invalidate-analyses", cl::init(true), cl::Hidden, 173 cl::desc("Eagerly invalidate more analyses in default pipelines")); 174 175 static cl::opt<bool> EnableNoRerunSimplificationPipeline( 176 "enable-no-rerun-simplification-pipeline", cl::init(false), cl::Hidden, 177 cl::desc( 178 "Prevent running the simplification pipeline on a function more " 179 "than once in the case that SCC mutations cause a function to be " 180 "visited multiple times as long as the function has not been changed")); 181 182 static cl::opt<bool> EnableMergeFunctions( 183 "enable-merge-functions", cl::init(false), cl::Hidden, 184 cl::desc("Enable function merging as part of the optimization pipeline")); 185 186 PipelineTuningOptions::PipelineTuningOptions() { 187 LoopInterleaving = true; 188 LoopVectorization = true; 189 SLPVectorization = false; 190 LoopUnrolling = true; 191 ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll; 192 LicmMssaOptCap = SetLicmMssaOptCap; 193 LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap; 194 CallGraphProfile = true; 195 MergeFunctions = EnableMergeFunctions; 196 EagerlyInvalidateAnalyses = EnableEagerlyInvalidateAnalyses; 197 } 198 199 namespace llvm { 200 201 extern cl::opt<unsigned> MaxDevirtIterations; 202 extern cl::opt<bool> EnableConstraintElimination; 203 extern cl::opt<bool> EnableFunctionSpecialization; 204 extern cl::opt<bool> EnableGVNHoist; 205 extern cl::opt<bool> EnableGVNSink; 206 extern cl::opt<bool> EnableHotColdSplit; 207 extern cl::opt<bool> EnableIROutliner; 208 extern cl::opt<bool> EnableOrderFileInstrumentation; 209 extern cl::opt<bool> EnableCHR; 210 extern cl::opt<bool> EnableLoopInterchange; 211 extern cl::opt<bool> EnableUnrollAndJam; 212 extern cl::opt<bool> EnableLoopFlatten; 213 extern cl::opt<bool> EnableDFAJumpThreading; 214 extern cl::opt<bool> RunNewGVN; 215 extern cl::opt<bool> RunPartialInlining; 216 extern cl::opt<bool> ExtraVectorizerPasses; 217 218 extern cl::opt<bool> FlattenedProfileUsed; 219 220 extern cl::opt<AttributorRunOption> AttributorRun; 221 extern cl::opt<bool> EnableKnowledgeRetention; 222 223 extern cl::opt<bool> EnableMatrix; 224 225 extern cl::opt<bool> DisablePreInliner; 226 extern cl::opt<int> PreInlineThreshold; 227 } // namespace llvm 228 229 void PassBuilder::invokePeepholeEPCallbacks(FunctionPassManager &FPM, 230 OptimizationLevel Level) { 231 for (auto &C : PeepholeEPCallbacks) 232 C(FPM, Level); 233 } 234 235 // Helper to add AnnotationRemarksPass. 236 static void addAnnotationRemarksPass(ModulePassManager &MPM) { 237 MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass())); 238 } 239 240 // Helper to check if the current compilation phase is preparing for LTO 241 static bool isLTOPreLink(ThinOrFullLTOPhase Phase) { 242 return Phase == ThinOrFullLTOPhase::ThinLTOPreLink || 243 Phase == ThinOrFullLTOPhase::FullLTOPreLink; 244 } 245 246 // TODO: Investigate the cost/benefit of tail call elimination on debugging. 247 FunctionPassManager 248 PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, 249 ThinOrFullLTOPhase Phase) { 250 251 FunctionPassManager FPM; 252 253 // Form SSA out of local memory accesses after breaking apart aggregates into 254 // scalars. 255 FPM.addPass(SROAPass()); 256 257 // Catch trivial redundancies 258 FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */)); 259 260 // Hoisting of scalars and load expressions. 261 FPM.addPass( 262 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 263 FPM.addPass(InstCombinePass()); 264 265 FPM.addPass(LibCallsShrinkWrapPass()); 266 267 invokePeepholeEPCallbacks(FPM, Level); 268 269 FPM.addPass( 270 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 271 272 // Form canonically associated expression trees, and simplify the trees using 273 // basic mathematical properties. For example, this will form (nearly) 274 // minimal multiplication trees. 275 FPM.addPass(ReassociatePass()); 276 277 // Add the primary loop simplification pipeline. 278 // FIXME: Currently this is split into two loop pass pipelines because we run 279 // some function passes in between them. These can and should be removed 280 // and/or replaced by scheduling the loop pass equivalents in the correct 281 // positions. But those equivalent passes aren't powerful enough yet. 282 // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still 283 // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to 284 // fully replace `SimplifyCFGPass`, and the closest to the other we have is 285 // `LoopInstSimplify`. 286 LoopPassManager LPM1, LPM2; 287 288 // Simplify the loop body. We do this initially to clean up after other loop 289 // passes run, either when iterating on a loop or on inner loops with 290 // implications on the outer loop. 291 LPM1.addPass(LoopInstSimplifyPass()); 292 LPM1.addPass(LoopSimplifyCFGPass()); 293 294 // Try to remove as much code from the loop header as possible, 295 // to reduce amount of IR that will have to be duplicated. However, 296 // do not perform speculative hoisting the first time as LICM 297 // will destroy metadata that may not need to be destroyed if run 298 // after loop rotation. 299 // TODO: Investigate promotion cap for O1. 300 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 301 /*AllowSpeculation=*/false)); 302 303 LPM1.addPass(LoopRotatePass(/* Disable header duplication */ true, 304 isLTOPreLink(Phase))); 305 // TODO: Investigate promotion cap for O1. 306 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 307 /*AllowSpeculation=*/true)); 308 LPM1.addPass(SimpleLoopUnswitchPass()); 309 if (EnableLoopFlatten) 310 LPM1.addPass(LoopFlattenPass()); 311 312 LPM2.addPass(LoopIdiomRecognizePass()); 313 LPM2.addPass(IndVarSimplifyPass()); 314 315 for (auto &C : LateLoopOptimizationsEPCallbacks) 316 C(LPM2, Level); 317 318 LPM2.addPass(LoopDeletionPass()); 319 320 if (EnableLoopInterchange) 321 LPM2.addPass(LoopInterchangePass()); 322 323 // Do not enable unrolling in PreLinkThinLTO phase during sample PGO 324 // because it changes IR to makes profile annotation in back compile 325 // inaccurate. The normal unroller doesn't pay attention to forced full unroll 326 // attributes so we need to make sure and allow the full unroll pass to pay 327 // attention to it. 328 if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt || 329 PGOOpt->Action != PGOOptions::SampleUse) 330 LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), 331 /* OnlyWhenForced= */ !PTO.LoopUnrolling, 332 PTO.ForgetAllSCEVInLoopUnroll)); 333 334 for (auto &C : LoopOptimizerEndEPCallbacks) 335 C(LPM2, Level); 336 337 // We provide the opt remark emitter pass for LICM to use. We only need to do 338 // this once as it is immutable. 339 FPM.addPass( 340 RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>()); 341 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), 342 /*UseMemorySSA=*/true, 343 /*UseBlockFrequencyInfo=*/true)); 344 FPM.addPass( 345 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 346 FPM.addPass(InstCombinePass()); 347 // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA. 348 // *All* loop passes must preserve it, in order to be able to use it. 349 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), 350 /*UseMemorySSA=*/false, 351 /*UseBlockFrequencyInfo=*/false)); 352 353 // Delete small array after loop unroll. 354 FPM.addPass(SROAPass()); 355 356 // Specially optimize memory movement as it doesn't look like dataflow in SSA. 357 FPM.addPass(MemCpyOptPass()); 358 359 // Sparse conditional constant propagation. 360 // FIXME: It isn't clear why we do this *after* loop passes rather than 361 // before... 362 FPM.addPass(SCCPPass()); 363 364 // Delete dead bit computations (instcombine runs after to fold away the dead 365 // computations, and then ADCE will run later to exploit any new DCE 366 // opportunities that creates). 367 FPM.addPass(BDCEPass()); 368 369 // Run instcombine after redundancy and dead bit elimination to exploit 370 // opportunities opened up by them. 371 FPM.addPass(InstCombinePass()); 372 invokePeepholeEPCallbacks(FPM, Level); 373 374 FPM.addPass(CoroElidePass()); 375 376 for (auto &C : ScalarOptimizerLateEPCallbacks) 377 C(FPM, Level); 378 379 // Finally, do an expensive DCE pass to catch all the dead code exposed by 380 // the simplifications and basic cleanup after all the simplifications. 381 // TODO: Investigate if this is too expensive. 382 FPM.addPass(ADCEPass()); 383 FPM.addPass( 384 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 385 FPM.addPass(InstCombinePass()); 386 invokePeepholeEPCallbacks(FPM, Level); 387 388 return FPM; 389 } 390 391 FunctionPassManager 392 PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, 393 ThinOrFullLTOPhase Phase) { 394 assert(Level != OptimizationLevel::O0 && "Must request optimizations!"); 395 396 // The O1 pipeline has a separate pipeline creation function to simplify 397 // construction readability. 398 if (Level.getSpeedupLevel() == 1) 399 return buildO1FunctionSimplificationPipeline(Level, Phase); 400 401 FunctionPassManager FPM; 402 403 // Form SSA out of local memory accesses after breaking apart aggregates into 404 // scalars. 405 FPM.addPass(SROAPass()); 406 407 // Catch trivial redundancies 408 FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */)); 409 if (EnableKnowledgeRetention) 410 FPM.addPass(AssumeSimplifyPass()); 411 412 // Hoisting of scalars and load expressions. 413 if (EnableGVNHoist) 414 FPM.addPass(GVNHoistPass()); 415 416 // Global value numbering based sinking. 417 if (EnableGVNSink) { 418 FPM.addPass(GVNSinkPass()); 419 FPM.addPass( 420 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 421 } 422 423 if (EnableConstraintElimination) 424 FPM.addPass(ConstraintEliminationPass()); 425 426 // Speculative execution if the target has divergent branches; otherwise nop. 427 FPM.addPass(SpeculativeExecutionPass(/* OnlyIfDivergentTarget =*/true)); 428 429 // Optimize based on known information about branches, and cleanup afterward. 430 FPM.addPass(JumpThreadingPass()); 431 FPM.addPass(CorrelatedValuePropagationPass()); 432 433 FPM.addPass( 434 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 435 FPM.addPass(InstCombinePass()); 436 if (Level == OptimizationLevel::O3) 437 FPM.addPass(AggressiveInstCombinePass()); 438 439 if (!Level.isOptimizingForSize()) 440 FPM.addPass(LibCallsShrinkWrapPass()); 441 442 invokePeepholeEPCallbacks(FPM, Level); 443 444 // For PGO use pipeline, try to optimize memory intrinsics such as memcpy 445 // using the size value profile. Don't perform this when optimizing for size. 446 if (PGOOpt && PGOOpt->Action == PGOOptions::IRUse && 447 !Level.isOptimizingForSize()) 448 FPM.addPass(PGOMemOPSizeOpt()); 449 450 FPM.addPass(TailCallElimPass()); 451 FPM.addPass( 452 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 453 454 // Form canonically associated expression trees, and simplify the trees using 455 // basic mathematical properties. For example, this will form (nearly) 456 // minimal multiplication trees. 457 FPM.addPass(ReassociatePass()); 458 459 // Add the primary loop simplification pipeline. 460 // FIXME: Currently this is split into two loop pass pipelines because we run 461 // some function passes in between them. These can and should be removed 462 // and/or replaced by scheduling the loop pass equivalents in the correct 463 // positions. But those equivalent passes aren't powerful enough yet. 464 // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still 465 // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to 466 // fully replace `SimplifyCFGPass`, and the closest to the other we have is 467 // `LoopInstSimplify`. 468 LoopPassManager LPM1, LPM2; 469 470 // Simplify the loop body. We do this initially to clean up after other loop 471 // passes run, either when iterating on a loop or on inner loops with 472 // implications on the outer loop. 473 LPM1.addPass(LoopInstSimplifyPass()); 474 LPM1.addPass(LoopSimplifyCFGPass()); 475 476 // Try to remove as much code from the loop header as possible, 477 // to reduce amount of IR that will have to be duplicated. However, 478 // do not perform speculative hoisting the first time as LICM 479 // will destroy metadata that may not need to be destroyed if run 480 // after loop rotation. 481 // TODO: Investigate promotion cap for O1. 482 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 483 /*AllowSpeculation=*/false)); 484 485 // Disable header duplication in loop rotation at -Oz. 486 LPM1.addPass( 487 LoopRotatePass(Level != OptimizationLevel::Oz, isLTOPreLink(Phase))); 488 // TODO: Investigate promotion cap for O1. 489 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 490 /*AllowSpeculation=*/true)); 491 LPM1.addPass( 492 SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3 && 493 EnableO3NonTrivialUnswitching)); 494 if (EnableLoopFlatten) 495 LPM1.addPass(LoopFlattenPass()); 496 497 LPM2.addPass(LoopIdiomRecognizePass()); 498 LPM2.addPass(IndVarSimplifyPass()); 499 500 for (auto &C : LateLoopOptimizationsEPCallbacks) 501 C(LPM2, Level); 502 503 LPM2.addPass(LoopDeletionPass()); 504 505 if (EnableLoopInterchange) 506 LPM2.addPass(LoopInterchangePass()); 507 508 // Do not enable unrolling in PreLinkThinLTO phase during sample PGO 509 // because it changes IR to makes profile annotation in back compile 510 // inaccurate. The normal unroller doesn't pay attention to forced full unroll 511 // attributes so we need to make sure and allow the full unroll pass to pay 512 // attention to it. 513 if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt || 514 PGOOpt->Action != PGOOptions::SampleUse) 515 LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), 516 /* OnlyWhenForced= */ !PTO.LoopUnrolling, 517 PTO.ForgetAllSCEVInLoopUnroll)); 518 519 for (auto &C : LoopOptimizerEndEPCallbacks) 520 C(LPM2, Level); 521 522 // We provide the opt remark emitter pass for LICM to use. We only need to do 523 // this once as it is immutable. 524 FPM.addPass( 525 RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>()); 526 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), 527 /*UseMemorySSA=*/true, 528 /*UseBlockFrequencyInfo=*/true)); 529 FPM.addPass( 530 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 531 FPM.addPass(InstCombinePass()); 532 // The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass, 533 // LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA. 534 // *All* loop passes must preserve it, in order to be able to use it. 535 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), 536 /*UseMemorySSA=*/false, 537 /*UseBlockFrequencyInfo=*/false)); 538 539 // Delete small array after loop unroll. 540 FPM.addPass(SROAPass()); 541 542 // The matrix extension can introduce large vector operations early, which can 543 // benefit from running vector-combine early on. 544 if (EnableMatrix) 545 FPM.addPass(VectorCombinePass(/*ScalarizationOnly=*/true)); 546 547 // Eliminate redundancies. 548 FPM.addPass(MergedLoadStoreMotionPass()); 549 if (RunNewGVN) 550 FPM.addPass(NewGVNPass()); 551 else 552 FPM.addPass(GVNPass()); 553 554 // Sparse conditional constant propagation. 555 // FIXME: It isn't clear why we do this *after* loop passes rather than 556 // before... 557 FPM.addPass(SCCPPass()); 558 559 // Delete dead bit computations (instcombine runs after to fold away the dead 560 // computations, and then ADCE will run later to exploit any new DCE 561 // opportunities that creates). 562 FPM.addPass(BDCEPass()); 563 564 // Run instcombine after redundancy and dead bit elimination to exploit 565 // opportunities opened up by them. 566 FPM.addPass(InstCombinePass()); 567 invokePeepholeEPCallbacks(FPM, Level); 568 569 // Re-consider control flow based optimizations after redundancy elimination, 570 // redo DCE, etc. 571 if (EnableDFAJumpThreading && Level.getSizeLevel() == 0) 572 FPM.addPass(DFAJumpThreadingPass()); 573 574 FPM.addPass(JumpThreadingPass()); 575 FPM.addPass(CorrelatedValuePropagationPass()); 576 577 // Finally, do an expensive DCE pass to catch all the dead code exposed by 578 // the simplifications and basic cleanup after all the simplifications. 579 // TODO: Investigate if this is too expensive. 580 FPM.addPass(ADCEPass()); 581 582 // Specially optimize memory movement as it doesn't look like dataflow in SSA. 583 FPM.addPass(MemCpyOptPass()); 584 585 FPM.addPass(DSEPass()); 586 FPM.addPass(createFunctionToLoopPassAdaptor( 587 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 588 /*AllowSpeculation=*/true), 589 /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); 590 591 FPM.addPass(CoroElidePass()); 592 593 for (auto &C : ScalarOptimizerLateEPCallbacks) 594 C(FPM, Level); 595 596 FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions() 597 .convertSwitchRangeToICmp(true) 598 .hoistCommonInsts(true) 599 .sinkCommonInsts(true))); 600 FPM.addPass(InstCombinePass()); 601 invokePeepholeEPCallbacks(FPM, Level); 602 603 if (EnableCHR && Level == OptimizationLevel::O3 && PGOOpt && 604 (PGOOpt->Action == PGOOptions::IRUse || 605 PGOOpt->Action == PGOOptions::SampleUse)) 606 FPM.addPass(ControlHeightReductionPass()); 607 608 return FPM; 609 } 610 611 void PassBuilder::addRequiredLTOPreLinkPasses(ModulePassManager &MPM) { 612 MPM.addPass(CanonicalizeAliasesPass()); 613 MPM.addPass(NameAnonGlobalPass()); 614 } 615 616 void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, 617 OptimizationLevel Level, bool RunProfileGen, 618 bool IsCS, std::string ProfileFile, 619 std::string ProfileRemappingFile) { 620 assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!"); 621 if (!IsCS && !DisablePreInliner) { 622 InlineParams IP; 623 624 IP.DefaultThreshold = PreInlineThreshold; 625 626 // FIXME: The hint threshold has the same value used by the regular inliner 627 // when not optimzing for size. This should probably be lowered after 628 // performance testing. 629 // FIXME: this comment is cargo culted from the old pass manager, revisit). 630 IP.HintThreshold = Level.isOptimizingForSize() ? PreInlineThreshold : 325; 631 ModuleInlinerWrapperPass MIWP(IP); 632 CGSCCPassManager &CGPipeline = MIWP.getPM(); 633 634 FunctionPassManager FPM; 635 FPM.addPass(SROAPass()); 636 FPM.addPass(EarlyCSEPass()); // Catch trivial redundancies. 637 FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp( 638 true))); // Merge & remove basic blocks. 639 FPM.addPass(InstCombinePass()); // Combine silly sequences. 640 invokePeepholeEPCallbacks(FPM, Level); 641 642 CGPipeline.addPass(createCGSCCToFunctionPassAdaptor( 643 std::move(FPM), PTO.EagerlyInvalidateAnalyses)); 644 645 MPM.addPass(std::move(MIWP)); 646 647 // Delete anything that is now dead to make sure that we don't instrument 648 // dead code. Instrumentation can end up keeping dead code around and 649 // dramatically increase code size. 650 MPM.addPass(GlobalDCEPass()); 651 } 652 653 if (!RunProfileGen) { 654 assert(!ProfileFile.empty() && "Profile use expecting a profile file!"); 655 MPM.addPass(PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS)); 656 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 657 // RequireAnalysisPass for PSI before subsequent non-module passes. 658 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 659 return; 660 } 661 662 // Perform PGO instrumentation. 663 MPM.addPass(PGOInstrumentationGen(IsCS)); 664 665 // Disable header duplication in loop rotation at -Oz. 666 MPM.addPass(createModuleToFunctionPassAdaptor( 667 createFunctionToLoopPassAdaptor( 668 LoopRotatePass(Level != OptimizationLevel::Oz), 669 /*UseMemorySSA=*/false, 670 /*UseBlockFrequencyInfo=*/false), 671 PTO.EagerlyInvalidateAnalyses)); 672 673 // Add the profile lowering pass. 674 InstrProfOptions Options; 675 if (!ProfileFile.empty()) 676 Options.InstrProfileOutput = ProfileFile; 677 // Do counter promotion at Level greater than O0. 678 Options.DoCounterPromotion = true; 679 Options.UseBFIInPromotion = IsCS; 680 MPM.addPass(InstrProfiling(Options, IsCS)); 681 } 682 683 void PassBuilder::addPGOInstrPassesForO0(ModulePassManager &MPM, 684 bool RunProfileGen, bool IsCS, 685 std::string ProfileFile, 686 std::string ProfileRemappingFile) { 687 if (!RunProfileGen) { 688 assert(!ProfileFile.empty() && "Profile use expecting a profile file!"); 689 MPM.addPass(PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS)); 690 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 691 // RequireAnalysisPass for PSI before subsequent non-module passes. 692 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 693 return; 694 } 695 696 // Perform PGO instrumentation. 697 MPM.addPass(PGOInstrumentationGen(IsCS)); 698 // Add the profile lowering pass. 699 InstrProfOptions Options; 700 if (!ProfileFile.empty()) 701 Options.InstrProfileOutput = ProfileFile; 702 // Do not do counter promotion at O0. 703 Options.DoCounterPromotion = false; 704 Options.UseBFIInPromotion = IsCS; 705 MPM.addPass(InstrProfiling(Options, IsCS)); 706 } 707 708 static InlineParams getInlineParamsFromOptLevel(OptimizationLevel Level) { 709 return getInlineParams(Level.getSpeedupLevel(), Level.getSizeLevel()); 710 } 711 712 ModuleInlinerWrapperPass 713 PassBuilder::buildInlinerPipeline(OptimizationLevel Level, 714 ThinOrFullLTOPhase Phase) { 715 InlineParams IP = getInlineParamsFromOptLevel(Level); 716 // For PreLinkThinLTO + SamplePGO, set hot-caller threshold to 0 to 717 // disable hot callsite inline (as much as possible [1]) because it makes 718 // profile annotation in the backend inaccurate. 719 // 720 // [1] Note the cost of a function could be below zero due to erased 721 // prologue / epilogue. 722 if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt && 723 PGOOpt->Action == PGOOptions::SampleUse) 724 IP.HotCallSiteThreshold = 0; 725 726 if (PGOOpt) 727 IP.EnableDeferral = EnablePGOInlineDeferral; 728 729 ModuleInlinerWrapperPass MIWP(IP, PerformMandatoryInliningsFirst, 730 UseInlineAdvisor, MaxDevirtIterations); 731 732 // Require the GlobalsAA analysis for the module so we can query it within 733 // the CGSCC pipeline. 734 MIWP.addModulePass(RequireAnalysisPass<GlobalsAA, Module>()); 735 // Invalidate AAManager so it can be recreated and pick up the newly available 736 // GlobalsAA. 737 MIWP.addModulePass( 738 createModuleToFunctionPassAdaptor(InvalidateAnalysisPass<AAManager>())); 739 740 // Require the ProfileSummaryAnalysis for the module so we can query it within 741 // the inliner pass. 742 MIWP.addModulePass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 743 744 // Now begin the main postorder CGSCC pipeline. 745 // FIXME: The current CGSCC pipeline has its origins in the legacy pass 746 // manager and trying to emulate its precise behavior. Much of this doesn't 747 // make a lot of sense and we should revisit the core CGSCC structure. 748 CGSCCPassManager &MainCGPipeline = MIWP.getPM(); 749 750 // Note: historically, the PruneEH pass was run first to deduce nounwind and 751 // generally clean up exception handling overhead. It isn't clear this is 752 // valuable as the inliner doesn't currently care whether it is inlining an 753 // invoke or a call. 754 755 if (AttributorRun & AttributorRunOption::CGSCC) 756 MainCGPipeline.addPass(AttributorCGSCCPass()); 757 758 // Now deduce any function attributes based in the current code. 759 MainCGPipeline.addPass(PostOrderFunctionAttrsPass()); 760 761 // When at O3 add argument promotion to the pass pipeline. 762 // FIXME: It isn't at all clear why this should be limited to O3. 763 if (Level == OptimizationLevel::O3) 764 MainCGPipeline.addPass(ArgumentPromotionPass()); 765 766 // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if 767 // there are no OpenMP runtime calls present in the module. 768 if (Level == OptimizationLevel::O2 || Level == OptimizationLevel::O3) 769 MainCGPipeline.addPass(OpenMPOptCGSCCPass()); 770 771 for (auto &C : CGSCCOptimizerLateEPCallbacks) 772 C(MainCGPipeline, Level); 773 774 // Lastly, add the core function simplification pipeline nested inside the 775 // CGSCC walk. 776 MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor( 777 buildFunctionSimplificationPipeline(Level, Phase), 778 PTO.EagerlyInvalidateAnalyses, EnableNoRerunSimplificationPipeline)); 779 780 MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0)); 781 782 if (EnableNoRerunSimplificationPipeline) 783 MIWP.addLateModulePass(createModuleToFunctionPassAdaptor( 784 InvalidateAnalysisPass<ShouldNotRunFunctionPassesAnalysis>())); 785 786 return MIWP; 787 } 788 789 ModulePassManager 790 PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level, 791 ThinOrFullLTOPhase Phase) { 792 ModulePassManager MPM; 793 794 InlineParams IP = getInlineParamsFromOptLevel(Level); 795 // For PreLinkThinLTO + SamplePGO, set hot-caller threshold to 0 to 796 // disable hot callsite inline (as much as possible [1]) because it makes 797 // profile annotation in the backend inaccurate. 798 // 799 // [1] Note the cost of a function could be below zero due to erased 800 // prologue / epilogue. 801 if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt && 802 PGOOpt->Action == PGOOptions::SampleUse) 803 IP.HotCallSiteThreshold = 0; 804 805 if (PGOOpt) 806 IP.EnableDeferral = EnablePGOInlineDeferral; 807 808 // The inline deferral logic is used to avoid losing some 809 // inlining chance in future. It is helpful in SCC inliner, in which 810 // inlining is processed in bottom-up order. 811 // While in module inliner, the inlining order is a priority-based order 812 // by default. The inline deferral is unnecessary there. So we disable the 813 // inline deferral logic in module inliner. 814 IP.EnableDeferral = false; 815 816 MPM.addPass(ModuleInlinerPass(IP, UseInlineAdvisor)); 817 818 MPM.addPass(createModuleToFunctionPassAdaptor( 819 buildFunctionSimplificationPipeline(Level, Phase), 820 PTO.EagerlyInvalidateAnalyses)); 821 822 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor( 823 CoroSplitPass(Level != OptimizationLevel::O0))); 824 825 return MPM; 826 } 827 828 ModulePassManager 829 PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, 830 ThinOrFullLTOPhase Phase) { 831 ModulePassManager MPM; 832 833 // Place pseudo probe instrumentation as the first pass of the pipeline to 834 // minimize the impact of optimization changes. 835 if (PGOOpt && PGOOpt->PseudoProbeForProfiling && 836 Phase != ThinOrFullLTOPhase::ThinLTOPostLink) 837 MPM.addPass(SampleProfileProbePass(TM)); 838 839 bool HasSampleProfile = PGOOpt && (PGOOpt->Action == PGOOptions::SampleUse); 840 841 // In ThinLTO mode, when flattened profile is used, all the available 842 // profile information will be annotated in PreLink phase so there is 843 // no need to load the profile again in PostLink. 844 bool LoadSampleProfile = 845 HasSampleProfile && 846 !(FlattenedProfileUsed && Phase == ThinOrFullLTOPhase::ThinLTOPostLink); 847 848 // During the ThinLTO backend phase we perform early indirect call promotion 849 // here, before globalopt. Otherwise imported available_externally functions 850 // look unreferenced and are removed. If we are going to load the sample 851 // profile then defer until later. 852 // TODO: See if we can move later and consolidate with the location where 853 // we perform ICP when we are loading a sample profile. 854 // TODO: We pass HasSampleProfile (whether there was a sample profile file 855 // passed to the compile) to the SamplePGO flag of ICP. This is used to 856 // determine whether the new direct calls are annotated with prof metadata. 857 // Ideally this should be determined from whether the IR is annotated with 858 // sample profile, and not whether the a sample profile was provided on the 859 // command line. E.g. for flattened profiles where we will not be reloading 860 // the sample profile in the ThinLTO backend, we ideally shouldn't have to 861 // provide the sample profile file. 862 if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink && !LoadSampleProfile) 863 MPM.addPass(PGOIndirectCallPromotion(true /* InLTO */, HasSampleProfile)); 864 865 // Do basic inference of function attributes from known properties of system 866 // libraries and other oracles. 867 MPM.addPass(InferFunctionAttrsPass()); 868 MPM.addPass(CoroEarlyPass()); 869 870 // Create an early function pass manager to cleanup the output of the 871 // frontend. 872 FunctionPassManager EarlyFPM; 873 // Lower llvm.expect to metadata before attempting transforms. 874 // Compare/branch metadata may alter the behavior of passes like SimplifyCFG. 875 EarlyFPM.addPass(LowerExpectIntrinsicPass()); 876 EarlyFPM.addPass(SimplifyCFGPass()); 877 EarlyFPM.addPass(SROAPass()); 878 EarlyFPM.addPass(EarlyCSEPass()); 879 if (Level == OptimizationLevel::O3) 880 EarlyFPM.addPass(CallSiteSplittingPass()); 881 882 // In SamplePGO ThinLTO backend, we need instcombine before profile annotation 883 // to convert bitcast to direct calls so that they can be inlined during the 884 // profile annotation prepration step. 885 // More details about SamplePGO design can be found in: 886 // https://research.google.com/pubs/pub45290.html 887 // FIXME: revisit how SampleProfileLoad/Inliner/ICP is structured. 888 if (LoadSampleProfile) 889 EarlyFPM.addPass(InstCombinePass()); 890 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM), 891 PTO.EagerlyInvalidateAnalyses)); 892 893 if (LoadSampleProfile) { 894 // Annotate sample profile right after early FPM to ensure freshness of 895 // the debug info. 896 MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile, 897 PGOOpt->ProfileRemappingFile, Phase)); 898 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 899 // RequireAnalysisPass for PSI before subsequent non-module passes. 900 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 901 // Do not invoke ICP in the LTOPrelink phase as it makes it hard 902 // for the profile annotation to be accurate in the LTO backend. 903 if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink && 904 Phase != ThinOrFullLTOPhase::FullLTOPreLink) 905 // We perform early indirect call promotion here, before globalopt. 906 // This is important for the ThinLTO backend phase because otherwise 907 // imported available_externally functions look unreferenced and are 908 // removed. 909 MPM.addPass( 910 PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */)); 911 } 912 913 // Try to perform OpenMP specific optimizations on the module. This is a 914 // (quick!) no-op if there are no OpenMP runtime calls present in the module. 915 if (Level != OptimizationLevel::O0) 916 MPM.addPass(OpenMPOptPass()); 917 918 if (AttributorRun & AttributorRunOption::MODULE) 919 MPM.addPass(AttributorPass()); 920 921 // Lower type metadata and the type.test intrinsic in the ThinLTO 922 // post link pipeline after ICP. This is to enable usage of the type 923 // tests in ICP sequences. 924 if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink) 925 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 926 927 for (auto &C : PipelineEarlySimplificationEPCallbacks) 928 C(MPM, Level); 929 930 // Specialize functions with IPSCCP. 931 if (EnableFunctionSpecialization && Level == OptimizationLevel::O3) 932 MPM.addPass(FunctionSpecializationPass()); 933 934 // Interprocedural constant propagation now that basic cleanup has occurred 935 // and prior to optimizing globals. 936 // FIXME: This position in the pipeline hasn't been carefully considered in 937 // years, it should be re-analyzed. 938 MPM.addPass(IPSCCPPass()); 939 940 // Attach metadata to indirect call sites indicating the set of functions 941 // they may target at run-time. This should follow IPSCCP. 942 MPM.addPass(CalledValuePropagationPass()); 943 944 // Optimize globals to try and fold them into constants. 945 MPM.addPass(GlobalOptPass()); 946 947 // Promote any localized globals to SSA registers. 948 // FIXME: Should this instead by a run of SROA? 949 // FIXME: We should probably run instcombine and simplifycfg afterward to 950 // delete control flows that are dead once globals have been folded to 951 // constants. 952 MPM.addPass(createModuleToFunctionPassAdaptor(PromotePass())); 953 954 // Remove any dead arguments exposed by cleanups and constant folding 955 // globals. 956 MPM.addPass(DeadArgumentEliminationPass()); 957 958 // Create a small function pass pipeline to cleanup after all the global 959 // optimizations. 960 FunctionPassManager GlobalCleanupPM; 961 GlobalCleanupPM.addPass(InstCombinePass()); 962 invokePeepholeEPCallbacks(GlobalCleanupPM, Level); 963 964 GlobalCleanupPM.addPass( 965 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 966 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM), 967 PTO.EagerlyInvalidateAnalyses)); 968 969 // Add all the requested passes for instrumentation PGO, if requested. 970 if (PGOOpt && Phase != ThinOrFullLTOPhase::ThinLTOPostLink && 971 (PGOOpt->Action == PGOOptions::IRInstr || 972 PGOOpt->Action == PGOOptions::IRUse)) { 973 addPGOInstrPasses(MPM, Level, 974 /* RunProfileGen */ PGOOpt->Action == PGOOptions::IRInstr, 975 /* IsCS */ false, PGOOpt->ProfileFile, 976 PGOOpt->ProfileRemappingFile); 977 MPM.addPass(PGOIndirectCallPromotion(false, false)); 978 } 979 if (PGOOpt && Phase != ThinOrFullLTOPhase::ThinLTOPostLink && 980 PGOOpt->CSAction == PGOOptions::CSIRInstr) 981 MPM.addPass(PGOInstrumentationGenCreateVar(PGOOpt->CSProfileGenFile)); 982 983 // Synthesize function entry counts for non-PGO compilation. 984 if (EnableSyntheticCounts && !PGOOpt) 985 MPM.addPass(SyntheticCountsPropagation()); 986 987 if (EnableModuleInliner) 988 MPM.addPass(buildModuleInlinerPipeline(Level, Phase)); 989 else 990 MPM.addPass(buildInlinerPipeline(Level, Phase)); 991 992 if (EnableMemProfiler && Phase != ThinOrFullLTOPhase::ThinLTOPreLink) { 993 MPM.addPass(createModuleToFunctionPassAdaptor(MemProfilerPass())); 994 MPM.addPass(ModuleMemProfilerPass()); 995 } 996 997 return MPM; 998 } 999 1000 /// TODO: Should LTO cause any differences to this set of passes? 1001 void PassBuilder::addVectorPasses(OptimizationLevel Level, 1002 FunctionPassManager &FPM, bool IsFullLTO) { 1003 FPM.addPass(LoopVectorizePass( 1004 LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); 1005 1006 if (IsFullLTO) { 1007 // The vectorizer may have significantly shortened a loop body; unroll 1008 // again. Unroll small loops to hide loop backedge latency and saturate any 1009 // parallel execution resources of an out-of-order processor. We also then 1010 // need to clean up redundancies and loop invariant code. 1011 // FIXME: It would be really good to use a loop-integrated instruction 1012 // combiner for cleanup here so that the unrolling and LICM can be pipelined 1013 // across the loop nests. 1014 // We do UnrollAndJam in a separate LPM to ensure it happens before unroll 1015 if (EnableUnrollAndJam && PTO.LoopUnrolling) 1016 FPM.addPass(createFunctionToLoopPassAdaptor( 1017 LoopUnrollAndJamPass(Level.getSpeedupLevel()))); 1018 FPM.addPass(LoopUnrollPass(LoopUnrollOptions( 1019 Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, 1020 PTO.ForgetAllSCEVInLoopUnroll))); 1021 FPM.addPass(WarnMissedTransformationsPass()); 1022 } 1023 1024 if (!IsFullLTO) { 1025 // Eliminate loads by forwarding stores from the previous iteration to loads 1026 // of the current iteration. 1027 FPM.addPass(LoopLoadEliminationPass()); 1028 } 1029 // Cleanup after the loop optimization passes. 1030 FPM.addPass(InstCombinePass()); 1031 1032 if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { 1033 ExtraVectorPassManager ExtraPasses; 1034 // At higher optimization levels, try to clean up any runtime overlap and 1035 // alignment checks inserted by the vectorizer. We want to track correlated 1036 // runtime checks for two inner loops in the same outer loop, fold any 1037 // common computations, hoist loop-invariant aspects out of any outer loop, 1038 // and unswitch the runtime checks if possible. Once hoisted, we may have 1039 // dead (or speculatable) control flows or more combining opportunities. 1040 ExtraPasses.addPass(EarlyCSEPass()); 1041 ExtraPasses.addPass(CorrelatedValuePropagationPass()); 1042 ExtraPasses.addPass(InstCombinePass()); 1043 LoopPassManager LPM; 1044 LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 1045 /*AllowSpeculation=*/true)); 1046 LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level == 1047 OptimizationLevel::O3)); 1048 ExtraPasses.addPass( 1049 RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>()); 1050 ExtraPasses.addPass( 1051 createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true, 1052 /*UseBlockFrequencyInfo=*/true)); 1053 ExtraPasses.addPass( 1054 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 1055 ExtraPasses.addPass(InstCombinePass()); 1056 FPM.addPass(std::move(ExtraPasses)); 1057 } 1058 1059 // Now that we've formed fast to execute loop structures, we do further 1060 // optimizations. These are run afterward as they might block doing complex 1061 // analyses and transforms such as what are needed for loop vectorization. 1062 1063 // Cleanup after loop vectorization, etc. Simplification passes like CVP and 1064 // GVN, loop transforms, and others have already run, so it's now better to 1065 // convert to more optimized IR using more aggressive simplify CFG options. 1066 // The extra sinking transform can create larger basic blocks, so do this 1067 // before SLP vectorization. 1068 FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions() 1069 .forwardSwitchCondToPhi(true) 1070 .convertSwitchRangeToICmp(true) 1071 .convertSwitchToLookupTable(true) 1072 .needCanonicalLoops(false) 1073 .hoistCommonInsts(true) 1074 .sinkCommonInsts(true))); 1075 1076 if (IsFullLTO) { 1077 FPM.addPass(SCCPPass()); 1078 FPM.addPass(InstCombinePass()); 1079 FPM.addPass(BDCEPass()); 1080 } 1081 1082 // Optimize parallel scalar instruction chains into SIMD instructions. 1083 if (PTO.SLPVectorization) { 1084 FPM.addPass(SLPVectorizerPass()); 1085 if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { 1086 FPM.addPass(EarlyCSEPass()); 1087 } 1088 } 1089 // Enhance/cleanup vector code. 1090 FPM.addPass(VectorCombinePass()); 1091 1092 if (!IsFullLTO) { 1093 FPM.addPass(InstCombinePass()); 1094 // Unroll small loops to hide loop backedge latency and saturate any 1095 // parallel execution resources of an out-of-order processor. We also then 1096 // need to clean up redundancies and loop invariant code. 1097 // FIXME: It would be really good to use a loop-integrated instruction 1098 // combiner for cleanup here so that the unrolling and LICM can be pipelined 1099 // across the loop nests. 1100 // We do UnrollAndJam in a separate LPM to ensure it happens before unroll 1101 if (EnableUnrollAndJam && PTO.LoopUnrolling) { 1102 FPM.addPass(createFunctionToLoopPassAdaptor( 1103 LoopUnrollAndJamPass(Level.getSpeedupLevel()))); 1104 } 1105 FPM.addPass(LoopUnrollPass(LoopUnrollOptions( 1106 Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, 1107 PTO.ForgetAllSCEVInLoopUnroll))); 1108 FPM.addPass(WarnMissedTransformationsPass()); 1109 FPM.addPass(InstCombinePass()); 1110 FPM.addPass( 1111 RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>()); 1112 FPM.addPass(createFunctionToLoopPassAdaptor( 1113 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 1114 /*AllowSpeculation=*/true), 1115 /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); 1116 } 1117 1118 // Now that we've vectorized and unrolled loops, we may have more refined 1119 // alignment information, try to re-derive it here. 1120 FPM.addPass(AlignmentFromAssumptionsPass()); 1121 1122 if (IsFullLTO) 1123 FPM.addPass(InstCombinePass()); 1124 } 1125 1126 ModulePassManager 1127 PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, 1128 bool LTOPreLink) { 1129 ModulePassManager MPM; 1130 1131 // Optimize globals now that the module is fully simplified. 1132 MPM.addPass(GlobalOptPass()); 1133 MPM.addPass(GlobalDCEPass()); 1134 1135 // Run partial inlining pass to partially inline functions that have 1136 // large bodies. 1137 if (RunPartialInlining) 1138 MPM.addPass(PartialInlinerPass()); 1139 1140 // Remove avail extern fns and globals definitions since we aren't compiling 1141 // an object file for later LTO. For LTO we want to preserve these so they 1142 // are eligible for inlining at link-time. Note if they are unreferenced they 1143 // will be removed by GlobalDCE later, so this only impacts referenced 1144 // available externally globals. Eventually they will be suppressed during 1145 // codegen, but eliminating here enables more opportunity for GlobalDCE as it 1146 // may make globals referenced by available external functions dead and saves 1147 // running remaining passes on the eliminated functions. These should be 1148 // preserved during prelinking for link-time inlining decisions. 1149 if (!LTOPreLink) 1150 MPM.addPass(EliminateAvailableExternallyPass()); 1151 1152 if (EnableOrderFileInstrumentation) 1153 MPM.addPass(InstrOrderFilePass()); 1154 1155 // Do RPO function attribute inference across the module to forward-propagate 1156 // attributes where applicable. 1157 // FIXME: Is this really an optimization rather than a canonicalization? 1158 MPM.addPass(ReversePostOrderFunctionAttrsPass()); 1159 1160 // Do a post inline PGO instrumentation and use pass. This is a context 1161 // sensitive PGO pass. We don't want to do this in LTOPreLink phrase as 1162 // cross-module inline has not been done yet. The context sensitive 1163 // instrumentation is after all the inlines are done. 1164 if (!LTOPreLink && PGOOpt) { 1165 if (PGOOpt->CSAction == PGOOptions::CSIRInstr) 1166 addPGOInstrPasses(MPM, Level, /* RunProfileGen */ true, 1167 /* IsCS */ true, PGOOpt->CSProfileGenFile, 1168 PGOOpt->ProfileRemappingFile); 1169 else if (PGOOpt->CSAction == PGOOptions::CSIRUse) 1170 addPGOInstrPasses(MPM, Level, /* RunProfileGen */ false, 1171 /* IsCS */ true, PGOOpt->ProfileFile, 1172 PGOOpt->ProfileRemappingFile); 1173 } 1174 1175 // Re-compute GlobalsAA here prior to function passes. This is particularly 1176 // useful as the above will have inlined, DCE'ed, and function-attr 1177 // propagated everything. We should at this point have a reasonably minimal 1178 // and richly annotated call graph. By computing aliasing and mod/ref 1179 // information for all local globals here, the late loop passes and notably 1180 // the vectorizer will be able to use them to help recognize vectorizable 1181 // memory operations. 1182 MPM.addPass(RecomputeGlobalsAAPass()); 1183 1184 for (auto &C : OptimizerEarlyEPCallbacks) 1185 C(MPM, Level); 1186 1187 FunctionPassManager OptimizePM; 1188 OptimizePM.addPass(Float2IntPass()); 1189 OptimizePM.addPass(LowerConstantIntrinsicsPass()); 1190 1191 if (EnableMatrix) { 1192 OptimizePM.addPass(LowerMatrixIntrinsicsPass()); 1193 OptimizePM.addPass(EarlyCSEPass()); 1194 } 1195 1196 // FIXME: We need to run some loop optimizations to re-rotate loops after 1197 // simplifycfg and others undo their rotation. 1198 1199 // Optimize the loop execution. These passes operate on entire loop nests 1200 // rather than on each loop in an inside-out manner, and so they are actually 1201 // function passes. 1202 1203 for (auto &C : VectorizerStartEPCallbacks) 1204 C(OptimizePM, Level); 1205 1206 LoopPassManager LPM; 1207 // First rotate loops that may have been un-rotated by prior passes. 1208 // Disable header duplication at -Oz. 1209 LPM.addPass(LoopRotatePass(Level != OptimizationLevel::Oz, LTOPreLink)); 1210 // Some loops may have become dead by now. Try to delete them. 1211 // FIXME: see discussion in https://reviews.llvm.org/D112851, 1212 // this may need to be revisited once we run GVN before loop deletion 1213 // in the simplification pipeline. 1214 LPM.addPass(LoopDeletionPass()); 1215 OptimizePM.addPass(createFunctionToLoopPassAdaptor( 1216 std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false)); 1217 1218 // Distribute loops to allow partial vectorization. I.e. isolate dependences 1219 // into separate loop that would otherwise inhibit vectorization. This is 1220 // currently only performed for loops marked with the metadata 1221 // llvm.loop.distribute=true or when -enable-loop-distribute is specified. 1222 OptimizePM.addPass(LoopDistributePass()); 1223 1224 // Populates the VFABI attribute with the scalar-to-vector mappings 1225 // from the TargetLibraryInfo. 1226 OptimizePM.addPass(InjectTLIMappings()); 1227 1228 addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false); 1229 1230 // LoopSink pass sinks instructions hoisted by LICM, which serves as a 1231 // canonicalization pass that enables other optimizations. As a result, 1232 // LoopSink pass needs to be a very late IR pass to avoid undoing LICM 1233 // result too early. 1234 OptimizePM.addPass(LoopSinkPass()); 1235 1236 // And finally clean up LCSSA form before generating code. 1237 OptimizePM.addPass(InstSimplifyPass()); 1238 1239 // This hoists/decomposes div/rem ops. It should run after other sink/hoist 1240 // passes to avoid re-sinking, but before SimplifyCFG because it can allow 1241 // flattening of blocks. 1242 OptimizePM.addPass(DivRemPairsPass()); 1243 1244 // LoopSink (and other loop passes since the last simplifyCFG) might have 1245 // resulted in single-entry-single-exit or empty blocks. Clean up the CFG. 1246 OptimizePM.addPass( 1247 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 1248 1249 OptimizePM.addPass(CoroCleanupPass()); 1250 1251 // Add the core optimizing pipeline. 1252 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM), 1253 PTO.EagerlyInvalidateAnalyses)); 1254 1255 for (auto &C : OptimizerLastEPCallbacks) 1256 C(MPM, Level); 1257 1258 // Split out cold code. Splitting is done late to avoid hiding context from 1259 // other optimizations and inadvertently regressing performance. The tradeoff 1260 // is that this has a higher code size cost than splitting early. 1261 if (EnableHotColdSplit && !LTOPreLink) 1262 MPM.addPass(HotColdSplittingPass()); 1263 1264 // Search the code for similar regions of code. If enough similar regions can 1265 // be found where extracting the regions into their own function will decrease 1266 // the size of the program, we extract the regions, a deduplicate the 1267 // structurally similar regions. 1268 if (EnableIROutliner) 1269 MPM.addPass(IROutlinerPass()); 1270 1271 // Merge functions if requested. 1272 if (PTO.MergeFunctions) 1273 MPM.addPass(MergeFunctionsPass()); 1274 1275 if (PTO.CallGraphProfile) 1276 MPM.addPass(CGProfilePass()); 1277 1278 // Now we need to do some global optimization transforms. 1279 // FIXME: It would seem like these should come first in the optimization 1280 // pipeline and maybe be the bottom of the canonicalization pipeline? Weird 1281 // ordering here. 1282 MPM.addPass(GlobalDCEPass()); 1283 MPM.addPass(ConstantMergePass()); 1284 1285 // TODO: Relative look table converter pass caused an issue when full lto is 1286 // enabled. See https://reviews.llvm.org/D94355 for more details. 1287 // Until the issue fixed, disable this pass during pre-linking phase. 1288 if (!LTOPreLink) 1289 MPM.addPass(RelLookupTableConverterPass()); 1290 1291 return MPM; 1292 } 1293 1294 ModulePassManager 1295 PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, 1296 bool LTOPreLink) { 1297 assert(Level != OptimizationLevel::O0 && 1298 "Must request optimizations for the default pipeline!"); 1299 1300 ModulePassManager MPM; 1301 1302 // Convert @llvm.global.annotations to !annotation metadata. 1303 MPM.addPass(Annotation2MetadataPass()); 1304 1305 // Force any function attributes we want the rest of the pipeline to observe. 1306 MPM.addPass(ForceFunctionAttrsPass()); 1307 1308 // Apply module pipeline start EP callback. 1309 for (auto &C : PipelineStartEPCallbacks) 1310 C(MPM, Level); 1311 1312 if (PGOOpt && PGOOpt->DebugInfoForProfiling) 1313 MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass())); 1314 1315 // Add the core simplification pipeline. 1316 MPM.addPass(buildModuleSimplificationPipeline( 1317 Level, LTOPreLink ? ThinOrFullLTOPhase::FullLTOPreLink 1318 : ThinOrFullLTOPhase::None)); 1319 1320 // Now add the optimization pipeline. 1321 MPM.addPass(buildModuleOptimizationPipeline(Level, LTOPreLink)); 1322 1323 if (PGOOpt && PGOOpt->PseudoProbeForProfiling && 1324 PGOOpt->Action == PGOOptions::SampleUse) 1325 MPM.addPass(PseudoProbeUpdatePass()); 1326 1327 // Emit annotation remarks. 1328 addAnnotationRemarksPass(MPM); 1329 1330 if (LTOPreLink) 1331 addRequiredLTOPreLinkPasses(MPM); 1332 1333 return MPM; 1334 } 1335 1336 ModulePassManager 1337 PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) { 1338 assert(Level != OptimizationLevel::O0 && 1339 "Must request optimizations for the default pipeline!"); 1340 1341 ModulePassManager MPM; 1342 1343 // Convert @llvm.global.annotations to !annotation metadata. 1344 MPM.addPass(Annotation2MetadataPass()); 1345 1346 // Force any function attributes we want the rest of the pipeline to observe. 1347 MPM.addPass(ForceFunctionAttrsPass()); 1348 1349 if (PGOOpt && PGOOpt->DebugInfoForProfiling) 1350 MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass())); 1351 1352 // Apply module pipeline start EP callback. 1353 for (auto &C : PipelineStartEPCallbacks) 1354 C(MPM, Level); 1355 1356 // If we are planning to perform ThinLTO later, we don't bloat the code with 1357 // unrolling/vectorization/... now. Just simplify the module as much as we 1358 // can. 1359 MPM.addPass(buildModuleSimplificationPipeline( 1360 Level, ThinOrFullLTOPhase::ThinLTOPreLink)); 1361 1362 // Run partial inlining pass to partially inline functions that have 1363 // large bodies. 1364 // FIXME: It isn't clear whether this is really the right place to run this 1365 // in ThinLTO. Because there is another canonicalization and simplification 1366 // phase that will run after the thin link, running this here ends up with 1367 // less information than will be available later and it may grow functions in 1368 // ways that aren't beneficial. 1369 if (RunPartialInlining) 1370 MPM.addPass(PartialInlinerPass()); 1371 1372 // Reduce the size of the IR as much as possible. 1373 MPM.addPass(GlobalOptPass()); 1374 1375 // Module simplification splits coroutines, but does not fully clean up 1376 // coroutine intrinsics. To ensure ThinLTO optimization passes don't trip up 1377 // on these, we schedule the cleanup here. 1378 MPM.addPass(createModuleToFunctionPassAdaptor(CoroCleanupPass())); 1379 1380 if (PGOOpt && PGOOpt->PseudoProbeForProfiling && 1381 PGOOpt->Action == PGOOptions::SampleUse) 1382 MPM.addPass(PseudoProbeUpdatePass()); 1383 1384 // Handle OptimizerLastEPCallbacks added by clang on PreLink. Actual 1385 // optimization is going to be done in PostLink stage, but clang can't 1386 // add callbacks there in case of in-process ThinLTO called by linker. 1387 for (auto &C : OptimizerLastEPCallbacks) 1388 C(MPM, Level); 1389 1390 // Emit annotation remarks. 1391 addAnnotationRemarksPass(MPM); 1392 1393 addRequiredLTOPreLinkPasses(MPM); 1394 1395 return MPM; 1396 } 1397 1398 ModulePassManager PassBuilder::buildThinLTODefaultPipeline( 1399 OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary) { 1400 ModulePassManager MPM; 1401 1402 // Convert @llvm.global.annotations to !annotation metadata. 1403 MPM.addPass(Annotation2MetadataPass()); 1404 1405 if (ImportSummary) { 1406 // These passes import type identifier resolutions for whole-program 1407 // devirtualization and CFI. They must run early because other passes may 1408 // disturb the specific instruction patterns that these passes look for, 1409 // creating dependencies on resolutions that may not appear in the summary. 1410 // 1411 // For example, GVN may transform the pattern assume(type.test) appearing in 1412 // two basic blocks into assume(phi(type.test, type.test)), which would 1413 // transform a dependency on a WPD resolution into a dependency on a type 1414 // identifier resolution for CFI. 1415 // 1416 // Also, WPD has access to more precise information than ICP and can 1417 // devirtualize more effectively, so it should operate on the IR first. 1418 // 1419 // The WPD and LowerTypeTest passes need to run at -O0 to lower type 1420 // metadata and intrinsics. 1421 MPM.addPass(WholeProgramDevirtPass(nullptr, ImportSummary)); 1422 MPM.addPass(LowerTypeTestsPass(nullptr, ImportSummary)); 1423 } 1424 1425 if (Level == OptimizationLevel::O0) { 1426 // Run a second time to clean up any type tests left behind by WPD for use 1427 // in ICP. 1428 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 1429 // Drop available_externally and unreferenced globals. This is necessary 1430 // with ThinLTO in order to avoid leaving undefined references to dead 1431 // globals in the object file. 1432 MPM.addPass(EliminateAvailableExternallyPass()); 1433 MPM.addPass(GlobalDCEPass()); 1434 return MPM; 1435 } 1436 1437 // Force any function attributes we want the rest of the pipeline to observe. 1438 MPM.addPass(ForceFunctionAttrsPass()); 1439 1440 // Add the core simplification pipeline. 1441 MPM.addPass(buildModuleSimplificationPipeline( 1442 Level, ThinOrFullLTOPhase::ThinLTOPostLink)); 1443 1444 // Now add the optimization pipeline. 1445 MPM.addPass(buildModuleOptimizationPipeline(Level)); 1446 1447 // Emit annotation remarks. 1448 addAnnotationRemarksPass(MPM); 1449 1450 return MPM; 1451 } 1452 1453 ModulePassManager 1454 PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level) { 1455 assert(Level != OptimizationLevel::O0 && 1456 "Must request optimizations for the default pipeline!"); 1457 // FIXME: We should use a customized pre-link pipeline! 1458 return buildPerModuleDefaultPipeline(Level, 1459 /* LTOPreLink */ true); 1460 } 1461 1462 ModulePassManager 1463 PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, 1464 ModuleSummaryIndex *ExportSummary) { 1465 ModulePassManager MPM; 1466 1467 // Convert @llvm.global.annotations to !annotation metadata. 1468 MPM.addPass(Annotation2MetadataPass()); 1469 1470 for (auto &C : FullLinkTimeOptimizationEarlyEPCallbacks) 1471 C(MPM, Level); 1472 1473 // Create a function that performs CFI checks for cross-DSO calls with targets 1474 // in the current module. 1475 MPM.addPass(CrossDSOCFIPass()); 1476 1477 if (Level == OptimizationLevel::O0) { 1478 // The WPD and LowerTypeTest passes need to run at -O0 to lower type 1479 // metadata and intrinsics. 1480 MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr)); 1481 MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr)); 1482 // Run a second time to clean up any type tests left behind by WPD for use 1483 // in ICP. 1484 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 1485 1486 for (auto &C : FullLinkTimeOptimizationLastEPCallbacks) 1487 C(MPM, Level); 1488 1489 // Emit annotation remarks. 1490 addAnnotationRemarksPass(MPM); 1491 1492 return MPM; 1493 } 1494 1495 if (PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) { 1496 // Load sample profile before running the LTO optimization pipeline. 1497 MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile, 1498 PGOOpt->ProfileRemappingFile, 1499 ThinOrFullLTOPhase::FullLTOPostLink)); 1500 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 1501 // RequireAnalysisPass for PSI before subsequent non-module passes. 1502 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 1503 } 1504 1505 // Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present. 1506 MPM.addPass(OpenMPOptPass()); 1507 1508 // Remove unused virtual tables to improve the quality of code generated by 1509 // whole-program devirtualization and bitset lowering. 1510 MPM.addPass(GlobalDCEPass()); 1511 1512 // Force any function attributes we want the rest of the pipeline to observe. 1513 MPM.addPass(ForceFunctionAttrsPass()); 1514 1515 // Do basic inference of function attributes from known properties of system 1516 // libraries and other oracles. 1517 MPM.addPass(InferFunctionAttrsPass()); 1518 1519 if (Level.getSpeedupLevel() > 1) { 1520 MPM.addPass(createModuleToFunctionPassAdaptor( 1521 CallSiteSplittingPass(), PTO.EagerlyInvalidateAnalyses)); 1522 1523 // Indirect call promotion. This should promote all the targets that are 1524 // left by the earlier promotion pass that promotes intra-module targets. 1525 // This two-step promotion is to save the compile time. For LTO, it should 1526 // produce the same result as if we only do promotion here. 1527 MPM.addPass(PGOIndirectCallPromotion( 1528 true /* InLTO */, PGOOpt && PGOOpt->Action == PGOOptions::SampleUse)); 1529 1530 if (EnableFunctionSpecialization && Level == OptimizationLevel::O3) 1531 MPM.addPass(FunctionSpecializationPass()); 1532 // Propagate constants at call sites into the functions they call. This 1533 // opens opportunities for globalopt (and inlining) by substituting function 1534 // pointers passed as arguments to direct uses of functions. 1535 MPM.addPass(IPSCCPPass()); 1536 1537 // Attach metadata to indirect call sites indicating the set of functions 1538 // they may target at run-time. This should follow IPSCCP. 1539 MPM.addPass(CalledValuePropagationPass()); 1540 } 1541 1542 // Now deduce any function attributes based in the current code. 1543 MPM.addPass( 1544 createModuleToPostOrderCGSCCPassAdaptor(PostOrderFunctionAttrsPass())); 1545 1546 // Do RPO function attribute inference across the module to forward-propagate 1547 // attributes where applicable. 1548 // FIXME: Is this really an optimization rather than a canonicalization? 1549 MPM.addPass(ReversePostOrderFunctionAttrsPass()); 1550 1551 // Use in-range annotations on GEP indices to split globals where beneficial. 1552 MPM.addPass(GlobalSplitPass()); 1553 1554 // Run whole program optimization of virtual call when the list of callees 1555 // is fixed. 1556 MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr)); 1557 1558 // Stop here at -O1. 1559 if (Level == OptimizationLevel::O1) { 1560 // The LowerTypeTestsPass needs to run to lower type metadata and the 1561 // type.test intrinsics. The pass does nothing if CFI is disabled. 1562 MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr)); 1563 // Run a second time to clean up any type tests left behind by WPD for use 1564 // in ICP (which is performed earlier than this in the regular LTO 1565 // pipeline). 1566 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 1567 1568 for (auto &C : FullLinkTimeOptimizationLastEPCallbacks) 1569 C(MPM, Level); 1570 1571 // Emit annotation remarks. 1572 addAnnotationRemarksPass(MPM); 1573 1574 return MPM; 1575 } 1576 1577 // Optimize globals to try and fold them into constants. 1578 MPM.addPass(GlobalOptPass()); 1579 1580 // Promote any localized globals to SSA registers. 1581 MPM.addPass(createModuleToFunctionPassAdaptor(PromotePass())); 1582 1583 // Linking modules together can lead to duplicate global constant, only 1584 // keep one copy of each constant. 1585 MPM.addPass(ConstantMergePass()); 1586 1587 // Remove unused arguments from functions. 1588 MPM.addPass(DeadArgumentEliminationPass()); 1589 1590 // Reduce the code after globalopt and ipsccp. Both can open up significant 1591 // simplification opportunities, and both can propagate functions through 1592 // function pointers. When this happens, we often have to resolve varargs 1593 // calls, etc, so let instcombine do this. 1594 FunctionPassManager PeepholeFPM; 1595 PeepholeFPM.addPass(InstCombinePass()); 1596 if (Level == OptimizationLevel::O3) 1597 PeepholeFPM.addPass(AggressiveInstCombinePass()); 1598 invokePeepholeEPCallbacks(PeepholeFPM, Level); 1599 1600 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(PeepholeFPM), 1601 PTO.EagerlyInvalidateAnalyses)); 1602 1603 // Note: historically, the PruneEH pass was run first to deduce nounwind and 1604 // generally clean up exception handling overhead. It isn't clear this is 1605 // valuable as the inliner doesn't currently care whether it is inlining an 1606 // invoke or a call. 1607 // Run the inliner now. 1608 MPM.addPass(ModuleInlinerWrapperPass(getInlineParamsFromOptLevel(Level))); 1609 1610 // Optimize globals again after we ran the inliner. 1611 MPM.addPass(GlobalOptPass()); 1612 1613 // Garbage collect dead functions. 1614 MPM.addPass(GlobalDCEPass()); 1615 1616 // If we didn't decide to inline a function, check to see if we can 1617 // transform it to pass arguments by value instead of by reference. 1618 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(ArgumentPromotionPass())); 1619 1620 FunctionPassManager FPM; 1621 // The IPO Passes may leave cruft around. Clean up after them. 1622 FPM.addPass(InstCombinePass()); 1623 invokePeepholeEPCallbacks(FPM, Level); 1624 1625 FPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true)); 1626 1627 // Do a post inline PGO instrumentation and use pass. This is a context 1628 // sensitive PGO pass. 1629 if (PGOOpt) { 1630 if (PGOOpt->CSAction == PGOOptions::CSIRInstr) 1631 addPGOInstrPasses(MPM, Level, /* RunProfileGen */ true, 1632 /* IsCS */ true, PGOOpt->CSProfileGenFile, 1633 PGOOpt->ProfileRemappingFile); 1634 else if (PGOOpt->CSAction == PGOOptions::CSIRUse) 1635 addPGOInstrPasses(MPM, Level, /* RunProfileGen */ false, 1636 /* IsCS */ true, PGOOpt->ProfileFile, 1637 PGOOpt->ProfileRemappingFile); 1638 } 1639 1640 // Break up allocas 1641 FPM.addPass(SROAPass()); 1642 1643 // LTO provides additional opportunities for tailcall elimination due to 1644 // link-time inlining, and visibility of nocapture attribute. 1645 FPM.addPass(TailCallElimPass()); 1646 1647 // Run a few AA driver optimizations here and now to cleanup the code. 1648 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM), 1649 PTO.EagerlyInvalidateAnalyses)); 1650 1651 MPM.addPass( 1652 createModuleToPostOrderCGSCCPassAdaptor(PostOrderFunctionAttrsPass())); 1653 1654 // Require the GlobalsAA analysis for the module so we can query it within 1655 // MainFPM. 1656 MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>()); 1657 // Invalidate AAManager so it can be recreated and pick up the newly available 1658 // GlobalsAA. 1659 MPM.addPass( 1660 createModuleToFunctionPassAdaptor(InvalidateAnalysisPass<AAManager>())); 1661 1662 FunctionPassManager MainFPM; 1663 MainFPM.addPass(createFunctionToLoopPassAdaptor( 1664 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 1665 /*AllowSpeculation=*/true), 1666 /*USeMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); 1667 1668 if (RunNewGVN) 1669 MainFPM.addPass(NewGVNPass()); 1670 else 1671 MainFPM.addPass(GVNPass()); 1672 1673 // Remove dead memcpy()'s. 1674 MainFPM.addPass(MemCpyOptPass()); 1675 1676 // Nuke dead stores. 1677 MainFPM.addPass(DSEPass()); 1678 MainFPM.addPass(MergedLoadStoreMotionPass()); 1679 1680 1681 if (EnableConstraintElimination) 1682 MainFPM.addPass(ConstraintEliminationPass()); 1683 1684 LoopPassManager LPM; 1685 if (EnableLoopFlatten && Level.getSpeedupLevel() > 1) 1686 LPM.addPass(LoopFlattenPass()); 1687 LPM.addPass(IndVarSimplifyPass()); 1688 LPM.addPass(LoopDeletionPass()); 1689 // FIXME: Add loop interchange. 1690 1691 // Unroll small loops and perform peeling. 1692 LPM.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), 1693 /* OnlyWhenForced= */ !PTO.LoopUnrolling, 1694 PTO.ForgetAllSCEVInLoopUnroll)); 1695 // The loop passes in LPM (LoopFullUnrollPass) do not preserve MemorySSA. 1696 // *All* loop passes must preserve it, in order to be able to use it. 1697 MainFPM.addPass(createFunctionToLoopPassAdaptor( 1698 std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/true)); 1699 1700 MainFPM.addPass(LoopDistributePass()); 1701 1702 addVectorPasses(Level, MainFPM, /* IsFullLTO */ true); 1703 1704 // Run the OpenMPOpt CGSCC pass again late. 1705 MPM.addPass( 1706 createModuleToPostOrderCGSCCPassAdaptor(OpenMPOptCGSCCPass())); 1707 1708 invokePeepholeEPCallbacks(MainFPM, Level); 1709 MainFPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true)); 1710 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM), 1711 PTO.EagerlyInvalidateAnalyses)); 1712 1713 // Lower type metadata and the type.test intrinsic. This pass supports 1714 // clang's control flow integrity mechanisms (-fsanitize=cfi*) and needs 1715 // to be run at link time if CFI is enabled. This pass does nothing if 1716 // CFI is disabled. 1717 MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr)); 1718 // Run a second time to clean up any type tests left behind by WPD for use 1719 // in ICP (which is performed earlier than this in the regular LTO pipeline). 1720 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 1721 1722 // Enable splitting late in the FullLTO post-link pipeline. This is done in 1723 // the same stage in the old pass manager (\ref addLateLTOOptimizationPasses). 1724 if (EnableHotColdSplit) 1725 MPM.addPass(HotColdSplittingPass()); 1726 1727 // Add late LTO optimization passes. 1728 // Delete basic blocks, which optimization passes may have killed. 1729 MPM.addPass(createModuleToFunctionPassAdaptor(SimplifyCFGPass( 1730 SimplifyCFGOptions().convertSwitchRangeToICmp(true).hoistCommonInsts( 1731 true)))); 1732 1733 // Drop bodies of available eternally objects to improve GlobalDCE. 1734 MPM.addPass(EliminateAvailableExternallyPass()); 1735 1736 // Now that we have optimized the program, discard unreachable functions. 1737 MPM.addPass(GlobalDCEPass()); 1738 1739 if (PTO.MergeFunctions) 1740 MPM.addPass(MergeFunctionsPass()); 1741 1742 for (auto &C : FullLinkTimeOptimizationLastEPCallbacks) 1743 C(MPM, Level); 1744 1745 // Emit annotation remarks. 1746 addAnnotationRemarksPass(MPM); 1747 1748 return MPM; 1749 } 1750 1751 ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level, 1752 bool LTOPreLink) { 1753 assert(Level == OptimizationLevel::O0 && 1754 "buildO0DefaultPipeline should only be used with O0"); 1755 1756 ModulePassManager MPM; 1757 1758 // Perform pseudo probe instrumentation in O0 mode. This is for the 1759 // consistency between different build modes. For example, a LTO build can be 1760 // mixed with an O0 prelink and an O2 postlink. Loading a sample profile in 1761 // the postlink will require pseudo probe instrumentation in the prelink. 1762 if (PGOOpt && PGOOpt->PseudoProbeForProfiling) 1763 MPM.addPass(SampleProfileProbePass(TM)); 1764 1765 if (PGOOpt && (PGOOpt->Action == PGOOptions::IRInstr || 1766 PGOOpt->Action == PGOOptions::IRUse)) 1767 addPGOInstrPassesForO0( 1768 MPM, 1769 /* RunProfileGen */ (PGOOpt->Action == PGOOptions::IRInstr), 1770 /* IsCS */ false, PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile); 1771 1772 for (auto &C : PipelineStartEPCallbacks) 1773 C(MPM, Level); 1774 1775 if (PGOOpt && PGOOpt->DebugInfoForProfiling) 1776 MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass())); 1777 1778 for (auto &C : PipelineEarlySimplificationEPCallbacks) 1779 C(MPM, Level); 1780 1781 // Build a minimal pipeline based on the semantics required by LLVM, 1782 // which is just that always inlining occurs. Further, disable generating 1783 // lifetime intrinsics to avoid enabling further optimizations during 1784 // code generation. 1785 MPM.addPass(AlwaysInlinerPass( 1786 /*InsertLifetimeIntrinsics=*/false)); 1787 1788 if (PTO.MergeFunctions) 1789 MPM.addPass(MergeFunctionsPass()); 1790 1791 if (EnableMatrix) 1792 MPM.addPass( 1793 createModuleToFunctionPassAdaptor(LowerMatrixIntrinsicsPass(true))); 1794 1795 if (!CGSCCOptimizerLateEPCallbacks.empty()) { 1796 CGSCCPassManager CGPM; 1797 for (auto &C : CGSCCOptimizerLateEPCallbacks) 1798 C(CGPM, Level); 1799 if (!CGPM.isEmpty()) 1800 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); 1801 } 1802 if (!LateLoopOptimizationsEPCallbacks.empty()) { 1803 LoopPassManager LPM; 1804 for (auto &C : LateLoopOptimizationsEPCallbacks) 1805 C(LPM, Level); 1806 if (!LPM.isEmpty()) { 1807 MPM.addPass(createModuleToFunctionPassAdaptor( 1808 createFunctionToLoopPassAdaptor(std::move(LPM)))); 1809 } 1810 } 1811 if (!LoopOptimizerEndEPCallbacks.empty()) { 1812 LoopPassManager LPM; 1813 for (auto &C : LoopOptimizerEndEPCallbacks) 1814 C(LPM, Level); 1815 if (!LPM.isEmpty()) { 1816 MPM.addPass(createModuleToFunctionPassAdaptor( 1817 createFunctionToLoopPassAdaptor(std::move(LPM)))); 1818 } 1819 } 1820 if (!ScalarOptimizerLateEPCallbacks.empty()) { 1821 FunctionPassManager FPM; 1822 for (auto &C : ScalarOptimizerLateEPCallbacks) 1823 C(FPM, Level); 1824 if (!FPM.isEmpty()) 1825 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); 1826 } 1827 1828 for (auto &C : OptimizerEarlyEPCallbacks) 1829 C(MPM, Level); 1830 1831 if (!VectorizerStartEPCallbacks.empty()) { 1832 FunctionPassManager FPM; 1833 for (auto &C : VectorizerStartEPCallbacks) 1834 C(FPM, Level); 1835 if (!FPM.isEmpty()) 1836 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); 1837 } 1838 1839 ModulePassManager CoroPM; 1840 CoroPM.addPass(CoroEarlyPass()); 1841 CGSCCPassManager CGPM; 1842 CGPM.addPass(CoroSplitPass()); 1843 CoroPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); 1844 CoroPM.addPass(createModuleToFunctionPassAdaptor(CoroCleanupPass())); 1845 CoroPM.addPass(GlobalDCEPass()); 1846 MPM.addPass(CoroConditionalWrapper(std::move(CoroPM))); 1847 1848 for (auto &C : OptimizerLastEPCallbacks) 1849 C(MPM, Level); 1850 1851 if (LTOPreLink) 1852 addRequiredLTOPreLinkPasses(MPM); 1853 1854 MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass())); 1855 1856 return MPM; 1857 } 1858 1859 AAManager PassBuilder::buildDefaultAAPipeline() { 1860 AAManager AA; 1861 1862 // The order in which these are registered determines their priority when 1863 // being queried. 1864 1865 // First we register the basic alias analysis that provides the majority of 1866 // per-function local AA logic. This is a stateless, on-demand local set of 1867 // AA techniques. 1868 AA.registerFunctionAnalysis<BasicAA>(); 1869 1870 // Next we query fast, specialized alias analyses that wrap IR-embedded 1871 // information about aliasing. 1872 AA.registerFunctionAnalysis<ScopedNoAliasAA>(); 1873 AA.registerFunctionAnalysis<TypeBasedAA>(); 1874 1875 // Add support for querying global aliasing information when available. 1876 // Because the `AAManager` is a function analysis and `GlobalsAA` is a module 1877 // analysis, all that the `AAManager` can do is query for any *cached* 1878 // results from `GlobalsAA` through a readonly proxy. 1879 AA.registerModuleAnalysis<GlobalsAA>(); 1880 1881 // Add target-specific alias analyses. 1882 if (TM) 1883 TM->registerDefaultAliasAnalyses(AA); 1884 1885 return AA; 1886 } 1887