1 //===- ScheduleOptimizer.cpp - Calculate an optimized schedule ------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass generates an entirely new schedule tree from the data dependences 10 // and iteration domains. The new schedule tree is computed in two steps: 11 // 12 // 1) The isl scheduling optimizer is run 13 // 14 // The isl scheduling optimizer creates a new schedule tree that maximizes 15 // parallelism and tileability and minimizes data-dependence distances. The 16 // algorithm used is a modified version of the ``Pluto'' algorithm: 17 // 18 // U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan. 19 // A Practical Automatic Polyhedral Parallelizer and Locality Optimizer. 20 // In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language 21 // Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008. 22 // 23 // 2) A set of post-scheduling transformations is applied on the schedule tree. 24 // 25 // These optimizations include: 26 // 27 // - Tiling of the innermost tilable bands 28 // - Prevectorization - The choice of a possible outer loop that is strip-mined 29 // to the innermost level to enable inner-loop 30 // vectorization. 31 // - Some optimizations for spatial locality are also planned. 32 // 33 // For a detailed description of the schedule tree itself please see section 6 34 // of: 35 // 36 // Polyhedral AST generation is more than scanning polyhedra 37 // Tobias Grosser, Sven Verdoolaege, Albert Cohen 38 // ACM Transactions on Programming Languages and Systems (TOPLAS), 39 // 37(4), July 2015 40 // http://www.grosser.es/#pub-polyhedral-AST-generation 41 // 42 // This publication also contains a detailed discussion of the different options 43 // for polyhedral loop unrolling, full/partial tile separation and other uses 44 // of the schedule tree. 45 // 46 //===----------------------------------------------------------------------===// 47 48 #include "polly/ScheduleOptimizer.h" 49 #include "polly/CodeGen/CodeGeneration.h" 50 #include "polly/DependenceInfo.h" 51 #include "polly/ManualOptimizer.h" 52 #include "polly/MatmulOptimizer.h" 53 #include "polly/Options.h" 54 #include "polly/ScheduleTreeTransform.h" 55 #include "polly/Support/ISLOStream.h" 56 #include "llvm/ADT/Sequence.h" 57 #include "llvm/ADT/Statistic.h" 58 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 59 #include "llvm/InitializePasses.h" 60 #include "llvm/Support/CommandLine.h" 61 #include "isl/options.h" 62 63 using namespace llvm; 64 using namespace polly; 65 66 namespace llvm { 67 class Loop; 68 class Module; 69 } // namespace llvm 70 71 #define DEBUG_TYPE "polly-opt-isl" 72 73 static cl::opt<std::string> 74 OptimizeDeps("polly-opt-optimize-only", 75 cl::desc("Only a certain kind of dependences (all/raw)"), 76 cl::Hidden, cl::init("all"), cl::ZeroOrMore, 77 cl::cat(PollyCategory)); 78 79 static cl::opt<std::string> 80 SimplifyDeps("polly-opt-simplify-deps", 81 cl::desc("Dependences should be simplified (yes/no)"), 82 cl::Hidden, cl::init("yes"), cl::ZeroOrMore, 83 cl::cat(PollyCategory)); 84 85 static cl::opt<int> MaxConstantTerm( 86 "polly-opt-max-constant-term", 87 cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden, 88 cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory)); 89 90 static cl::opt<int> MaxCoefficient( 91 "polly-opt-max-coefficient", 92 cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden, 93 cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory)); 94 95 static cl::opt<std::string> 96 MaximizeBandDepth("polly-opt-maximize-bands", 97 cl::desc("Maximize the band depth (yes/no)"), cl::Hidden, 98 cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory)); 99 100 static cl::opt<bool> 101 GreedyFusion("polly-loopfusion-greedy", 102 cl::desc("Aggressively try to fuse everything"), cl::Hidden, 103 cl::ZeroOrMore, cl::cat(PollyCategory)); 104 105 static cl::opt<std::string> OuterCoincidence( 106 "polly-opt-outer-coincidence", 107 cl::desc("Try to construct schedules where the outer member of each band " 108 "satisfies the coincidence constraints (yes/no)"), 109 cl::Hidden, cl::init("no"), cl::ZeroOrMore, cl::cat(PollyCategory)); 110 111 static cl::opt<int> PrevectorWidth( 112 "polly-prevect-width", 113 cl::desc( 114 "The number of loop iterations to strip-mine for pre-vectorization"), 115 cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory)); 116 117 static cl::opt<bool> FirstLevelTiling("polly-tiling", 118 cl::desc("Enable loop tiling"), 119 cl::init(true), cl::ZeroOrMore, 120 cl::cat(PollyCategory)); 121 122 static cl::opt<int> FirstLevelDefaultTileSize( 123 "polly-default-tile-size", 124 cl::desc("The default tile size (if not enough were provided by" 125 " --polly-tile-sizes)"), 126 cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory)); 127 128 static cl::list<int> 129 FirstLevelTileSizes("polly-tile-sizes", 130 cl::desc("A tile size for each loop dimension, filled " 131 "with --polly-default-tile-size"), 132 cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, 133 cl::cat(PollyCategory)); 134 135 static cl::opt<bool> 136 SecondLevelTiling("polly-2nd-level-tiling", 137 cl::desc("Enable a 2nd level loop of loop tiling"), 138 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); 139 140 static cl::opt<int> SecondLevelDefaultTileSize( 141 "polly-2nd-level-default-tile-size", 142 cl::desc("The default 2nd-level tile size (if not enough were provided by" 143 " --polly-2nd-level-tile-sizes)"), 144 cl::Hidden, cl::init(16), cl::ZeroOrMore, cl::cat(PollyCategory)); 145 146 static cl::list<int> 147 SecondLevelTileSizes("polly-2nd-level-tile-sizes", 148 cl::desc("A tile size for each loop dimension, filled " 149 "with --polly-default-tile-size"), 150 cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, 151 cl::cat(PollyCategory)); 152 153 static cl::opt<bool> RegisterTiling("polly-register-tiling", 154 cl::desc("Enable register tiling"), 155 cl::init(false), cl::ZeroOrMore, 156 cl::cat(PollyCategory)); 157 158 static cl::opt<int> RegisterDefaultTileSize( 159 "polly-register-tiling-default-tile-size", 160 cl::desc("The default register tile size (if not enough were provided by" 161 " --polly-register-tile-sizes)"), 162 cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory)); 163 164 static cl::list<int> 165 RegisterTileSizes("polly-register-tile-sizes", 166 cl::desc("A tile size for each loop dimension, filled " 167 "with --polly-register-tile-size"), 168 cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, 169 cl::cat(PollyCategory)); 170 171 static cl::opt<bool> PragmaBasedOpts( 172 "polly-pragma-based-opts", 173 cl::desc("Apply user-directed transformation from metadata"), 174 cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory)); 175 176 static cl::opt<bool> EnableReschedule("polly-reschedule", 177 cl::desc("Optimize SCoPs using ISL"), 178 cl::init(true), cl::ZeroOrMore, 179 cl::cat(PollyCategory)); 180 181 static cl::opt<bool> 182 PMBasedOpts("polly-pattern-matching-based-opts", 183 cl::desc("Perform optimizations based on pattern matching"), 184 cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory)); 185 186 static cl::opt<bool> 187 EnablePostopts("polly-postopts", 188 cl::desc("Apply post-rescheduling optimizations such as " 189 "tiling (requires -polly-reschedule)"), 190 cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory)); 191 192 static cl::opt<bool> OptimizedScops( 193 "polly-optimized-scops", 194 cl::desc("Polly - Dump polyhedral description of Scops optimized with " 195 "the isl scheduling optimizer and the set of post-scheduling " 196 "transformations is applied on the schedule tree"), 197 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); 198 199 STATISTIC(ScopsProcessed, "Number of scops processed"); 200 STATISTIC(ScopsRescheduled, "Number of scops rescheduled"); 201 STATISTIC(ScopsOptimized, "Number of scops optimized"); 202 203 STATISTIC(NumAffineLoopsOptimized, "Number of affine loops optimized"); 204 STATISTIC(NumBoxedLoopsOptimized, "Number of boxed loops optimized"); 205 206 #define THREE_STATISTICS(VARNAME, DESC) \ 207 static Statistic VARNAME[3] = { \ 208 {DEBUG_TYPE, #VARNAME "0", DESC " (original)"}, \ 209 {DEBUG_TYPE, #VARNAME "1", DESC " (after scheduler)"}, \ 210 {DEBUG_TYPE, #VARNAME "2", DESC " (after optimizer)"}} 211 212 THREE_STATISTICS(NumBands, "Number of bands"); 213 THREE_STATISTICS(NumBandMembers, "Number of band members"); 214 THREE_STATISTICS(NumCoincident, "Number of coincident band members"); 215 THREE_STATISTICS(NumPermutable, "Number of permutable bands"); 216 THREE_STATISTICS(NumFilters, "Number of filter nodes"); 217 THREE_STATISTICS(NumExtension, "Number of extension nodes"); 218 219 STATISTIC(FirstLevelTileOpts, "Number of first level tiling applied"); 220 STATISTIC(SecondLevelTileOpts, "Number of second level tiling applied"); 221 STATISTIC(RegisterTileOpts, "Number of register tiling applied"); 222 STATISTIC(PrevectOpts, "Number of strip-mining for prevectorization applied"); 223 STATISTIC(MatMulOpts, 224 "Number of matrix multiplication patterns detected and optimized"); 225 226 namespace { 227 /// Additional parameters of the schedule optimizer. 228 /// 229 /// Target Transform Info and the SCoP dependencies used by the schedule 230 /// optimizer. 231 struct OptimizerAdditionalInfoTy { 232 const llvm::TargetTransformInfo *TTI; 233 const Dependences *D; 234 bool PatternOpts; 235 bool Postopts; 236 bool Prevect; 237 }; 238 239 class ScheduleTreeOptimizer { 240 public: 241 /// Apply schedule tree transformations. 242 /// 243 /// This function takes an (possibly already optimized) schedule tree and 244 /// applies a set of additional optimizations on the schedule tree. The 245 /// transformations applied include: 246 /// 247 /// - Pattern-based optimizations 248 /// - Tiling 249 /// - Prevectorization 250 /// 251 /// @param Schedule The schedule object the transformations will be applied 252 /// to. 253 /// @param OAI Target Transform Info and the SCoP dependencies. 254 /// @returns The transformed schedule. 255 static isl::schedule 256 optimizeSchedule(isl::schedule Schedule, 257 const OptimizerAdditionalInfoTy *OAI = nullptr); 258 259 /// Apply schedule tree transformations. 260 /// 261 /// This function takes a node in an (possibly already optimized) schedule 262 /// tree and applies a set of additional optimizations on this schedule tree 263 /// node and its descendants. The transformations applied include: 264 /// 265 /// - Pattern-based optimizations 266 /// - Tiling 267 /// - Prevectorization 268 /// 269 /// @param Node The schedule object post-transformations will be applied to. 270 /// @param OAI Target Transform Info and the SCoP dependencies. 271 /// @returns The transformed schedule. 272 static isl::schedule_node 273 optimizeScheduleNode(isl::schedule_node Node, 274 const OptimizerAdditionalInfoTy *OAI = nullptr); 275 276 /// Decide if the @p NewSchedule is profitable for @p S. 277 /// 278 /// @param S The SCoP we optimize. 279 /// @param NewSchedule The new schedule we computed. 280 /// 281 /// @return True, if we believe @p NewSchedule is an improvement for @p S. 282 static bool isProfitableSchedule(polly::Scop &S, isl::schedule NewSchedule); 283 284 /// Isolate a set of partial tile prefixes. 285 /// 286 /// This set should ensure that it contains only partial tile prefixes that 287 /// have exactly VectorWidth iterations. 288 /// 289 /// @param Node A schedule node band, which is a parent of a band node, 290 /// that contains a vector loop. 291 /// @return Modified isl_schedule_node. 292 static isl::schedule_node isolateFullPartialTiles(isl::schedule_node Node, 293 int VectorWidth); 294 295 private: 296 /// Check if this node is a band node we want to tile. 297 /// 298 /// We look for innermost band nodes where individual dimensions are marked as 299 /// permutable. 300 /// 301 /// @param Node The node to check. 302 static bool isTileableBandNode(isl::schedule_node Node); 303 304 /// Pre-vectorizes one scheduling dimension of a schedule band. 305 /// 306 /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and 307 /// sinks the resulting point loop. 308 /// 309 /// Example (DimToVectorize=0, VectorWidth=4): 310 /// 311 /// | Before transformation: 312 /// | 313 /// | A[i,j] -> [i,j] 314 /// | 315 /// | for (i = 0; i < 128; i++) 316 /// | for (j = 0; j < 128; j++) 317 /// | A(i,j); 318 /// 319 /// | After transformation: 320 /// | 321 /// | for (it = 0; it < 32; it+=1) 322 /// | for (j = 0; j < 128; j++) 323 /// | for (ip = 0; ip <= 3; ip++) 324 /// | A(4 * it + ip,j); 325 /// 326 /// The goal of this transformation is to create a trivially vectorizable 327 /// loop. This means a parallel loop at the innermost level that has a 328 /// constant number of iterations corresponding to the target vector width. 329 /// 330 /// This transformation creates a loop at the innermost level. The loop has 331 /// a constant number of iterations, if the number of loop iterations at 332 /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is 333 /// currently constant and not yet target specific. This function does not 334 /// reason about parallelism. 335 static isl::schedule_node prevectSchedBand(isl::schedule_node Node, 336 unsigned DimToVectorize, 337 int VectorWidth); 338 339 /// Apply additional optimizations on the bands in the schedule tree. 340 /// 341 /// We are looking for an innermost band node and apply the following 342 /// transformations: 343 /// 344 /// - Tile the band 345 /// - if the band is tileable 346 /// - if the band has more than one loop dimension 347 /// 348 /// - Prevectorize the schedule of the band (or the point loop in case of 349 /// tiling). 350 /// - if vectorization is enabled 351 /// 352 /// @param Node The schedule node to (possibly) optimize. 353 /// @param User A pointer to forward some use information 354 /// (currently unused). 355 static isl_schedule_node *optimizeBand(isl_schedule_node *Node, void *User); 356 357 /// Apply tiling optimizations on the bands in the schedule tree. 358 /// 359 /// @param Node The schedule node to (possibly) optimize. 360 static isl::schedule_node applyTileBandOpt(isl::schedule_node Node); 361 362 /// Apply prevectorization on the bands in the schedule tree. 363 /// 364 /// @param Node The schedule node to (possibly) prevectorize. 365 static isl::schedule_node applyPrevectBandOpt(isl::schedule_node Node); 366 }; 367 368 isl::schedule_node 369 ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node, 370 int VectorWidth) { 371 assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); 372 Node = Node.child(0).child(0); 373 isl::union_map SchedRelUMap = Node.get_prefix_schedule_relation(); 374 isl::union_set ScheduleRangeUSet = SchedRelUMap.range(); 375 isl::set ScheduleRange{ScheduleRangeUSet}; 376 isl::set IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth); 377 auto AtomicOption = getDimOptions(IsolateDomain.ctx(), "atomic"); 378 isl::union_set IsolateOption = getIsolateOptions(IsolateDomain, 1); 379 Node = Node.parent().parent(); 380 isl::union_set Options = IsolateOption.unite(AtomicOption); 381 isl::schedule_node_band Result = 382 Node.as<isl::schedule_node_band>().set_ast_build_options(Options); 383 return Result; 384 } 385 386 isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand( 387 isl::schedule_node Node, unsigned DimToVectorize, int VectorWidth) { 388 assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); 389 390 auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); 391 isl_size ScheduleDimensions = Space.dim(isl::dim::set).release(); 392 assert((isl_size)DimToVectorize < ScheduleDimensions); 393 394 if (DimToVectorize > 0) { 395 Node = isl::manage( 396 isl_schedule_node_band_split(Node.release(), DimToVectorize)); 397 Node = Node.child(0); 398 } 399 if ((isl_size)DimToVectorize < ScheduleDimensions - 1) 400 Node = isl::manage(isl_schedule_node_band_split(Node.release(), 1)); 401 Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); 402 auto Sizes = isl::multi_val::zero(Space); 403 Sizes = Sizes.set_val(0, isl::val(Node.ctx(), VectorWidth)); 404 Node = 405 isl::manage(isl_schedule_node_band_tile(Node.release(), Sizes.release())); 406 Node = isolateFullPartialTiles(Node, VectorWidth); 407 Node = Node.child(0); 408 // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise, 409 // we will have troubles to match it in the backend. 410 isl::schedule_node_band NodeBand = 411 Node.as<isl::schedule_node_band>().set_ast_build_options( 412 isl::union_set(Node.ctx(), "{ unroll[x]: 1 = 0 }")); 413 Node = isl::manage(isl_schedule_node_band_sink(NodeBand.release())); 414 Node = Node.child(0); 415 if (isl_schedule_node_get_type(Node.get()) == isl_schedule_node_leaf) 416 Node = Node.parent(); 417 auto LoopMarker = isl::id::alloc(Node.ctx(), "SIMD", nullptr); 418 PrevectOpts++; 419 return Node.insert_mark(LoopMarker); 420 } 421 422 static bool isSimpleInnermostBand(const isl::schedule_node &Node) { 423 assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); 424 assert(isl_schedule_node_n_children(Node.get()) == 1); 425 426 auto ChildType = isl_schedule_node_get_type(Node.child(0).get()); 427 428 if (ChildType == isl_schedule_node_leaf) 429 return true; 430 431 if (ChildType != isl_schedule_node_sequence) 432 return false; 433 434 auto Sequence = Node.child(0); 435 436 for (int c = 0, nc = isl_schedule_node_n_children(Sequence.get()); c < nc; 437 ++c) { 438 auto Child = Sequence.child(c); 439 if (isl_schedule_node_get_type(Child.get()) != isl_schedule_node_filter) 440 return false; 441 if (isl_schedule_node_get_type(Child.child(0).get()) != 442 isl_schedule_node_leaf) 443 return false; 444 } 445 return true; 446 } 447 448 bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) { 449 if (isl_schedule_node_get_type(Node.get()) != isl_schedule_node_band) 450 return false; 451 452 if (isl_schedule_node_n_children(Node.get()) != 1) 453 return false; 454 455 if (!isl_schedule_node_band_get_permutable(Node.get())) 456 return false; 457 458 auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); 459 auto Dims = Space.dim(isl::dim::set).release(); 460 461 if (Dims <= 1) 462 return false; 463 464 return isSimpleInnermostBand(Node); 465 } 466 467 __isl_give isl::schedule_node 468 ScheduleTreeOptimizer::applyTileBandOpt(isl::schedule_node Node) { 469 if (FirstLevelTiling) { 470 Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes, 471 FirstLevelDefaultTileSize); 472 FirstLevelTileOpts++; 473 } 474 475 if (SecondLevelTiling) { 476 Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes, 477 SecondLevelDefaultTileSize); 478 SecondLevelTileOpts++; 479 } 480 481 if (RegisterTiling) { 482 Node = 483 applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize); 484 RegisterTileOpts++; 485 } 486 487 return Node; 488 } 489 490 isl::schedule_node 491 ScheduleTreeOptimizer::applyPrevectBandOpt(isl::schedule_node Node) { 492 auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); 493 auto Dims = Space.dim(isl::dim::set).release(); 494 495 for (int i = Dims - 1; i >= 0; i--) 496 if (Node.as<isl::schedule_node_band>().member_get_coincident(i)) { 497 Node = prevectSchedBand(Node, i, PrevectorWidth); 498 break; 499 } 500 501 return Node; 502 } 503 504 __isl_give isl_schedule_node * 505 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg, 506 void *User) { 507 const OptimizerAdditionalInfoTy *OAI = 508 static_cast<const OptimizerAdditionalInfoTy *>(User); 509 assert(OAI && "Expecting optimization options"); 510 511 isl::schedule_node Node = isl::manage(NodeArg); 512 if (!isTileableBandNode(Node)) 513 return Node.release(); 514 515 if (OAI->PatternOpts) { 516 isl::schedule_node PatternOptimizedSchedule = 517 tryOptimizeMatMulPattern(Node, OAI->TTI, OAI->D); 518 if (!PatternOptimizedSchedule.is_null()) { 519 MatMulOpts++; 520 return PatternOptimizedSchedule.release(); 521 } 522 } 523 524 if (OAI->Postopts) 525 Node = applyTileBandOpt(Node); 526 527 if (OAI->Prevect) { 528 // FIXME: Prevectorization requirements are different from those checked by 529 // isTileableBandNode. 530 Node = applyPrevectBandOpt(Node); 531 } 532 533 return Node.release(); 534 } 535 536 isl::schedule 537 ScheduleTreeOptimizer::optimizeSchedule(isl::schedule Schedule, 538 const OptimizerAdditionalInfoTy *OAI) { 539 auto Root = Schedule.get_root(); 540 Root = optimizeScheduleNode(Root, OAI); 541 return Root.get_schedule(); 542 } 543 544 isl::schedule_node ScheduleTreeOptimizer::optimizeScheduleNode( 545 isl::schedule_node Node, const OptimizerAdditionalInfoTy *OAI) { 546 Node = isl::manage(isl_schedule_node_map_descendant_bottom_up( 547 Node.release(), optimizeBand, 548 const_cast<void *>(static_cast<const void *>(OAI)))); 549 return Node; 550 } 551 552 bool ScheduleTreeOptimizer::isProfitableSchedule(Scop &S, 553 isl::schedule NewSchedule) { 554 // To understand if the schedule has been optimized we check if the schedule 555 // has changed at all. 556 // TODO: We can improve this by tracking if any necessarily beneficial 557 // transformations have been performed. This can e.g. be tiling, loop 558 // interchange, or ...) We can track this either at the place where the 559 // transformation has been performed or, in case of automatic ILP based 560 // optimizations, by comparing (yet to be defined) performance metrics 561 // before/after the scheduling optimizer 562 // (e.g., #stride-one accesses) 563 // FIXME: A schedule tree whose union_map-conversion is identical to the 564 // original schedule map may still allow for parallelization, i.e. can still 565 // be profitable. 566 auto NewScheduleMap = NewSchedule.get_map(); 567 auto OldSchedule = S.getSchedule(); 568 assert(!OldSchedule.is_null() && 569 "Only IslScheduleOptimizer can insert extension nodes " 570 "that make Scop::getSchedule() return nullptr."); 571 bool changed = !OldSchedule.is_equal(NewScheduleMap); 572 return changed; 573 } 574 575 class IslScheduleOptimizerWrapperPass : public ScopPass { 576 public: 577 static char ID; 578 579 explicit IslScheduleOptimizerWrapperPass() : ScopPass(ID) {} 580 581 /// Optimize the schedule of the SCoP @p S. 582 bool runOnScop(Scop &S) override; 583 584 /// Print the new schedule for the SCoP @p S. 585 void printScop(raw_ostream &OS, Scop &S) const override; 586 587 /// Register all analyses and transformation required. 588 void getAnalysisUsage(AnalysisUsage &AU) const override; 589 590 /// Release the internal memory. 591 void releaseMemory() override { 592 LastSchedule = {}; 593 IslCtx.reset(); 594 } 595 596 private: 597 std::shared_ptr<isl_ctx> IslCtx; 598 isl::schedule LastSchedule; 599 }; 600 601 char IslScheduleOptimizerWrapperPass::ID = 0; 602 603 #ifndef NDEBUG 604 static void printSchedule(llvm::raw_ostream &OS, const isl::schedule &Schedule, 605 StringRef Desc) { 606 isl::ctx Ctx = Schedule.ctx(); 607 isl_printer *P = isl_printer_to_str(Ctx.get()); 608 P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); 609 P = isl_printer_print_schedule(P, Schedule.get()); 610 char *Str = isl_printer_get_str(P); 611 OS << Desc << ": \n" << Str << "\n"; 612 free(Str); 613 isl_printer_free(P); 614 } 615 #endif 616 617 /// Collect statistics for the schedule tree. 618 /// 619 /// @param Schedule The schedule tree to analyze. If not a schedule tree it is 620 /// ignored. 621 /// @param Version The version of the schedule tree that is analyzed. 622 /// 0 for the original schedule tree before any transformation. 623 /// 1 for the schedule tree after isl's rescheduling. 624 /// 2 for the schedule tree after optimizations are applied 625 /// (tiling, pattern matching) 626 static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) { 627 auto Root = Schedule.get_root(); 628 if (Root.is_null()) 629 return; 630 631 isl_schedule_node_foreach_descendant_top_down( 632 Root.get(), 633 [](__isl_keep isl_schedule_node *nodeptr, void *user) -> isl_bool { 634 isl::schedule_node Node = isl::manage_copy(nodeptr); 635 int Version = *static_cast<int *>(user); 636 637 switch (isl_schedule_node_get_type(Node.get())) { 638 case isl_schedule_node_band: { 639 NumBands[Version]++; 640 if (isl_schedule_node_band_get_permutable(Node.get()) == 641 isl_bool_true) 642 NumPermutable[Version]++; 643 644 int CountMembers = isl_schedule_node_band_n_member(Node.get()); 645 NumBandMembers[Version] += CountMembers; 646 for (int i = 0; i < CountMembers; i += 1) { 647 if (Node.as<isl::schedule_node_band>().member_get_coincident(i)) 648 NumCoincident[Version]++; 649 } 650 break; 651 } 652 653 case isl_schedule_node_filter: 654 NumFilters[Version]++; 655 break; 656 657 case isl_schedule_node_extension: 658 NumExtension[Version]++; 659 break; 660 661 default: 662 break; 663 } 664 665 return isl_bool_true; 666 }, 667 &Version); 668 } 669 670 static bool runIslScheduleOptimizer( 671 Scop &S, 672 function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps, 673 TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, 674 isl::schedule &LastSchedule) { 675 676 // Skip SCoPs in case they're already optimised by PPCGCodeGeneration 677 if (S.isToBeSkipped()) 678 return false; 679 680 // Skip empty SCoPs but still allow code generation as it will delete the 681 // loops present but not needed. 682 if (S.getSize() == 0) { 683 S.markAsOptimized(); 684 return false; 685 } 686 687 ScopsProcessed++; 688 689 // Schedule without optimizations. 690 isl::schedule Schedule = S.getScheduleTree(); 691 walkScheduleTreeForStatistics(S.getScheduleTree(), 0); 692 LLVM_DEBUG(printSchedule(dbgs(), Schedule, "Original schedule tree")); 693 694 bool HasUserTransformation = false; 695 if (PragmaBasedOpts) { 696 isl::schedule ManuallyTransformed = applyManualTransformations( 697 &S, Schedule, GetDeps(Dependences::AL_Statement), ORE); 698 if (ManuallyTransformed.is_null()) { 699 LLVM_DEBUG(dbgs() << "Error during manual optimization\n"); 700 return false; 701 } 702 703 if (ManuallyTransformed.get() != Schedule.get()) { 704 // User transformations have precedence over other transformations. 705 HasUserTransformation = true; 706 Schedule = std::move(ManuallyTransformed); 707 LLVM_DEBUG( 708 printSchedule(dbgs(), Schedule, "After manual transformations")); 709 } 710 } 711 712 // Only continue if either manual transformations have been applied or we are 713 // allowed to apply heuristics. 714 // TODO: Detect disabled heuristics and no user-directed transformation 715 // metadata earlier in ScopDetection. 716 if (!HasUserTransformation && S.hasDisableHeuristicsHint()) { 717 LLVM_DEBUG(dbgs() << "Heuristic optimizations disabled by metadata\n"); 718 return false; 719 } 720 721 // Get dependency analysis. 722 const Dependences &D = GetDeps(Dependences::AL_Statement); 723 if (D.getSharedIslCtx() != S.getSharedIslCtx()) { 724 LLVM_DEBUG(dbgs() << "DependenceInfo for another SCoP/isl_ctx\n"); 725 return false; 726 } 727 if (!D.hasValidDependences()) { 728 LLVM_DEBUG(dbgs() << "Dependency information not available\n"); 729 return false; 730 } 731 732 // Apply ISL's algorithm only if not overriden by the user. Note that 733 // post-rescheduling optimizations (tiling, pattern-based, prevectorization) 734 // rely on the coincidence/permutable annotations on schedule tree bands that 735 // are added by the rescheduling analyzer. Therefore, disabling the 736 // rescheduler implicitly also disables these optimizations. 737 if (!EnableReschedule) { 738 LLVM_DEBUG(dbgs() << "Skipping rescheduling due to command line option\n"); 739 } else if (HasUserTransformation) { 740 LLVM_DEBUG( 741 dbgs() << "Skipping rescheduling due to manual transformation\n"); 742 } else { 743 // Build input data. 744 int ValidityKinds = 745 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 746 int ProximityKinds; 747 748 if (OptimizeDeps == "all") 749 ProximityKinds = 750 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 751 else if (OptimizeDeps == "raw") 752 ProximityKinds = Dependences::TYPE_RAW; 753 else { 754 errs() << "Do not know how to optimize for '" << OptimizeDeps << "'" 755 << " Falling back to optimizing all dependences.\n"; 756 ProximityKinds = 757 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 758 } 759 760 isl::union_set Domain = S.getDomains(); 761 762 if (Domain.is_null()) 763 return false; 764 765 isl::union_map Validity = D.getDependences(ValidityKinds); 766 isl::union_map Proximity = D.getDependences(ProximityKinds); 767 768 // Simplify the dependences by removing the constraints introduced by the 769 // domains. This can speed up the scheduling time significantly, as large 770 // constant coefficients will be removed from the dependences. The 771 // introduction of some additional dependences reduces the possible 772 // transformations, but in most cases, such transformation do not seem to be 773 // interesting anyway. In some cases this option may stop the scheduler to 774 // find any schedule. 775 if (SimplifyDeps == "yes") { 776 Validity = Validity.gist_domain(Domain); 777 Validity = Validity.gist_range(Domain); 778 Proximity = Proximity.gist_domain(Domain); 779 Proximity = Proximity.gist_range(Domain); 780 } else if (SimplifyDeps != "no") { 781 errs() 782 << "warning: Option -polly-opt-simplify-deps should either be 'yes' " 783 "or 'no'. Falling back to default: 'yes'\n"; 784 } 785 786 LLVM_DEBUG(dbgs() << "\n\nCompute schedule from: "); 787 LLVM_DEBUG(dbgs() << "Domain := " << Domain << ";\n"); 788 LLVM_DEBUG(dbgs() << "Proximity := " << Proximity << ";\n"); 789 LLVM_DEBUG(dbgs() << "Validity := " << Validity << ";\n"); 790 791 int IslMaximizeBands; 792 if (MaximizeBandDepth == "yes") { 793 IslMaximizeBands = 1; 794 } else if (MaximizeBandDepth == "no") { 795 IslMaximizeBands = 0; 796 } else { 797 errs() 798 << "warning: Option -polly-opt-maximize-bands should either be 'yes'" 799 " or 'no'. Falling back to default: 'yes'\n"; 800 IslMaximizeBands = 1; 801 } 802 803 int IslOuterCoincidence; 804 if (OuterCoincidence == "yes") { 805 IslOuterCoincidence = 1; 806 } else if (OuterCoincidence == "no") { 807 IslOuterCoincidence = 0; 808 } else { 809 errs() << "warning: Option -polly-opt-outer-coincidence should either be " 810 "'yes' or 'no'. Falling back to default: 'no'\n"; 811 IslOuterCoincidence = 0; 812 } 813 814 isl_ctx *Ctx = S.getIslCtx().get(); 815 816 isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence); 817 isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands); 818 isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm); 819 isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient); 820 isl_options_set_tile_scale_tile_loops(Ctx, 0); 821 822 auto OnErrorStatus = isl_options_get_on_error(Ctx); 823 isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE); 824 825 auto SC = isl::schedule_constraints::on_domain(Domain); 826 SC = SC.set_proximity(Proximity); 827 SC = SC.set_validity(Validity); 828 SC = SC.set_coincidence(Validity); 829 Schedule = SC.compute_schedule(); 830 isl_options_set_on_error(Ctx, OnErrorStatus); 831 832 ScopsRescheduled++; 833 LLVM_DEBUG(printSchedule(dbgs(), Schedule, "After rescheduling")); 834 } 835 836 walkScheduleTreeForStatistics(Schedule, 1); 837 838 // In cases the scheduler is not able to optimize the code, we just do not 839 // touch the schedule. 840 if (Schedule.is_null()) 841 return false; 842 843 if (GreedyFusion) { 844 isl::union_map Validity = D.getDependences( 845 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW); 846 Schedule = applyGreedyFusion(Schedule, Validity); 847 assert(!Schedule.is_null()); 848 } 849 850 // Apply post-rescheduling optimizations (if enabled) and/or prevectorization. 851 const OptimizerAdditionalInfoTy OAI = { 852 TTI, const_cast<Dependences *>(&D), 853 /*PatternOpts=*/!HasUserTransformation && PMBasedOpts, 854 /*Postopts=*/!HasUserTransformation && EnablePostopts, 855 /*Prevect=*/PollyVectorizerChoice != VECTORIZER_NONE}; 856 if (OAI.PatternOpts || OAI.Postopts || OAI.Prevect) { 857 Schedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI); 858 Schedule = hoistExtensionNodes(Schedule); 859 LLVM_DEBUG(printSchedule(dbgs(), Schedule, "After post-optimizations")); 860 walkScheduleTreeForStatistics(Schedule, 2); 861 } 862 863 // Skip profitability check if user transformation(s) have been applied. 864 if (!HasUserTransformation && 865 !ScheduleTreeOptimizer::isProfitableSchedule(S, Schedule)) 866 return false; 867 868 auto ScopStats = S.getStatistics(); 869 ScopsOptimized++; 870 NumAffineLoopsOptimized += ScopStats.NumAffineLoops; 871 NumBoxedLoopsOptimized += ScopStats.NumBoxedLoops; 872 LastSchedule = Schedule; 873 874 S.setScheduleTree(Schedule); 875 S.markAsOptimized(); 876 877 if (OptimizedScops) 878 errs() << S; 879 880 return false; 881 } 882 883 bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) { 884 releaseMemory(); 885 886 Function &F = S.getFunction(); 887 IslCtx = S.getSharedIslCtx(); 888 889 auto getDependences = 890 [this](Dependences::AnalysisLevel) -> const Dependences & { 891 return getAnalysis<DependenceInfo>().getDependences( 892 Dependences::AL_Statement); 893 }; 894 OptimizationRemarkEmitter &ORE = 895 getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 896 TargetTransformInfo *TTI = 897 &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 898 return runIslScheduleOptimizer(S, getDependences, TTI, &ORE, LastSchedule); 899 } 900 901 static void runScheduleOptimizerPrinter(raw_ostream &OS, 902 isl::schedule LastSchedule) { 903 isl_printer *p; 904 char *ScheduleStr; 905 906 OS << "Calculated schedule:\n"; 907 908 if (LastSchedule.is_null()) { 909 OS << "n/a\n"; 910 return; 911 } 912 913 p = isl_printer_to_str(LastSchedule.ctx().get()); 914 p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK); 915 p = isl_printer_print_schedule(p, LastSchedule.get()); 916 ScheduleStr = isl_printer_get_str(p); 917 isl_printer_free(p); 918 919 OS << ScheduleStr << "\n"; 920 921 free(ScheduleStr); 922 } 923 924 void IslScheduleOptimizerWrapperPass::printScop(raw_ostream &OS, Scop &) const { 925 runScheduleOptimizerPrinter(OS, LastSchedule); 926 } 927 928 void IslScheduleOptimizerWrapperPass::getAnalysisUsage( 929 AnalysisUsage &AU) const { 930 ScopPass::getAnalysisUsage(AU); 931 AU.addRequired<DependenceInfo>(); 932 AU.addRequired<TargetTransformInfoWrapperPass>(); 933 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 934 935 AU.addPreserved<DependenceInfo>(); 936 AU.addPreserved<OptimizationRemarkEmitterWrapperPass>(); 937 } 938 939 } // namespace 940 941 Pass *polly::createIslScheduleOptimizerWrapperPass() { 942 return new IslScheduleOptimizerWrapperPass(); 943 } 944 945 INITIALIZE_PASS_BEGIN(IslScheduleOptimizerWrapperPass, "polly-opt-isl", 946 "Polly - Optimize schedule of SCoP", false, false); 947 INITIALIZE_PASS_DEPENDENCY(DependenceInfo); 948 INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); 949 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass); 950 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass); 951 INITIALIZE_PASS_END(IslScheduleOptimizerWrapperPass, "polly-opt-isl", 952 "Polly - Optimize schedule of SCoP", false, false) 953 954 static llvm::PreservedAnalyses 955 runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM, 956 ScopStandardAnalysisResults &SAR, SPMUpdater &U, 957 raw_ostream *OS) { 958 DependenceAnalysis::Result &Deps = SAM.getResult<DependenceAnalysis>(S, SAR); 959 auto GetDeps = [&Deps](Dependences::AnalysisLevel) -> const Dependences & { 960 return Deps.getDependences(Dependences::AL_Statement); 961 }; 962 OptimizationRemarkEmitter ORE(&S.getFunction()); 963 TargetTransformInfo *TTI = &SAR.TTI; 964 isl::schedule LastSchedule; 965 bool Modified = runIslScheduleOptimizer(S, GetDeps, TTI, &ORE, LastSchedule); 966 if (OS) { 967 *OS << "Printing analysis 'Polly - Optimize schedule of SCoP' for region: '" 968 << S.getName() << "' in function '" << S.getFunction().getName() 969 << "':\n"; 970 runScheduleOptimizerPrinter(*OS, LastSchedule); 971 } 972 973 if (!Modified) 974 return PreservedAnalyses::all(); 975 976 PreservedAnalyses PA; 977 PA.preserveSet<AllAnalysesOn<Module>>(); 978 PA.preserveSet<AllAnalysesOn<Function>>(); 979 PA.preserveSet<AllAnalysesOn<Loop>>(); 980 return PA; 981 } 982 983 llvm::PreservedAnalyses 984 IslScheduleOptimizerPass::run(Scop &S, ScopAnalysisManager &SAM, 985 ScopStandardAnalysisResults &SAR, SPMUpdater &U) { 986 return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, nullptr); 987 } 988 989 llvm::PreservedAnalyses 990 IslScheduleOptimizerPrinterPass::run(Scop &S, ScopAnalysisManager &SAM, 991 ScopStandardAnalysisResults &SAR, 992 SPMUpdater &U) { 993 return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, &OS); 994 } 995