1 //===- ScheduleOptimizer.cpp - Calculate an optimized schedule ------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass generates an entirely new schedule tree from the data dependences 10 // and iteration domains. The new schedule tree is computed in two steps: 11 // 12 // 1) The isl scheduling optimizer is run 13 // 14 // The isl scheduling optimizer creates a new schedule tree that maximizes 15 // parallelism and tileability and minimizes data-dependence distances. The 16 // algorithm used is a modified version of the ``Pluto'' algorithm: 17 // 18 // U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan. 19 // A Practical Automatic Polyhedral Parallelizer and Locality Optimizer. 20 // In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language 21 // Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008. 22 // 23 // 2) A set of post-scheduling transformations is applied on the schedule tree. 24 // 25 // These optimizations include: 26 // 27 // - Tiling of the innermost tilable bands 28 // - Prevectorization - The choice of a possible outer loop that is strip-mined 29 // to the innermost level to enable inner-loop 30 // vectorization. 31 // - Some optimizations for spatial locality are also planned. 32 // 33 // For a detailed description of the schedule tree itself please see section 6 34 // of: 35 // 36 // Polyhedral AST generation is more than scanning polyhedra 37 // Tobias Grosser, Sven Verdoolaege, Albert Cohen 38 // ACM Transactions on Programming Languages and Systems (TOPLAS), 39 // 37(4), July 2015 40 // http://www.grosser.es/#pub-polyhedral-AST-generation 41 // 42 // This publication also contains a detailed discussion of the different options 43 // for polyhedral loop unrolling, full/partial tile separation and other uses 44 // of the schedule tree. 45 // 46 //===----------------------------------------------------------------------===// 47 48 #include "polly/ScheduleOptimizer.h" 49 #include "polly/CodeGen/CodeGeneration.h" 50 #include "polly/DependenceInfo.h" 51 #include "polly/ManualOptimizer.h" 52 #include "polly/MatmulOptimizer.h" 53 #include "polly/Options.h" 54 #include "polly/ScheduleTreeTransform.h" 55 #include "polly/Support/ISLOStream.h" 56 #include "polly/Support/ISLTools.h" 57 #include "llvm/ADT/Sequence.h" 58 #include "llvm/ADT/Statistic.h" 59 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 60 #include "llvm/InitializePasses.h" 61 #include "llvm/Support/CommandLine.h" 62 #include "isl/options.h" 63 64 using namespace llvm; 65 using namespace polly; 66 67 namespace llvm { 68 class Loop; 69 class Module; 70 } // namespace llvm 71 72 #define DEBUG_TYPE "polly-opt-isl" 73 74 static cl::opt<std::string> 75 OptimizeDeps("polly-opt-optimize-only", 76 cl::desc("Only a certain kind of dependences (all/raw)"), 77 cl::Hidden, cl::init("all"), cl::cat(PollyCategory)); 78 79 static cl::opt<std::string> 80 SimplifyDeps("polly-opt-simplify-deps", 81 cl::desc("Dependences should be simplified (yes/no)"), 82 cl::Hidden, cl::init("yes"), cl::cat(PollyCategory)); 83 84 static cl::opt<int> MaxConstantTerm( 85 "polly-opt-max-constant-term", 86 cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden, 87 cl::init(20), cl::cat(PollyCategory)); 88 89 static cl::opt<int> MaxCoefficient( 90 "polly-opt-max-coefficient", 91 cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden, 92 cl::init(20), cl::cat(PollyCategory)); 93 94 static cl::opt<std::string> 95 MaximizeBandDepth("polly-opt-maximize-bands", 96 cl::desc("Maximize the band depth (yes/no)"), cl::Hidden, 97 cl::init("yes"), cl::cat(PollyCategory)); 98 99 static cl::opt<bool> 100 GreedyFusion("polly-loopfusion-greedy", 101 cl::desc("Aggressively try to fuse everything"), cl::Hidden, 102 cl::cat(PollyCategory)); 103 104 static cl::opt<std::string> OuterCoincidence( 105 "polly-opt-outer-coincidence", 106 cl::desc("Try to construct schedules where the outer member of each band " 107 "satisfies the coincidence constraints (yes/no)"), 108 cl::Hidden, cl::init("no"), cl::cat(PollyCategory)); 109 110 static cl::opt<int> PrevectorWidth( 111 "polly-prevect-width", 112 cl::desc( 113 "The number of loop iterations to strip-mine for pre-vectorization"), 114 cl::Hidden, cl::init(4), cl::cat(PollyCategory)); 115 116 static cl::opt<bool> FirstLevelTiling("polly-tiling", 117 cl::desc("Enable loop tiling"), 118 cl::init(true), cl::cat(PollyCategory)); 119 120 static cl::opt<int> FirstLevelDefaultTileSize( 121 "polly-default-tile-size", 122 cl::desc("The default tile size (if not enough were provided by" 123 " --polly-tile-sizes)"), 124 cl::Hidden, cl::init(32), cl::cat(PollyCategory)); 125 126 static cl::list<int> 127 FirstLevelTileSizes("polly-tile-sizes", 128 cl::desc("A tile size for each loop dimension, filled " 129 "with --polly-default-tile-size"), 130 cl::Hidden, cl::CommaSeparated, cl::cat(PollyCategory)); 131 132 static cl::opt<bool> 133 SecondLevelTiling("polly-2nd-level-tiling", 134 cl::desc("Enable a 2nd level loop of loop tiling"), 135 cl::cat(PollyCategory)); 136 137 static cl::opt<int> SecondLevelDefaultTileSize( 138 "polly-2nd-level-default-tile-size", 139 cl::desc("The default 2nd-level tile size (if not enough were provided by" 140 " --polly-2nd-level-tile-sizes)"), 141 cl::Hidden, cl::init(16), cl::cat(PollyCategory)); 142 143 static cl::list<int> 144 SecondLevelTileSizes("polly-2nd-level-tile-sizes", 145 cl::desc("A tile size for each loop dimension, filled " 146 "with --polly-default-tile-size"), 147 cl::Hidden, cl::CommaSeparated, 148 cl::cat(PollyCategory)); 149 150 static cl::opt<bool> RegisterTiling("polly-register-tiling", 151 cl::desc("Enable register tiling"), 152 cl::cat(PollyCategory)); 153 154 static cl::opt<int> RegisterDefaultTileSize( 155 "polly-register-tiling-default-tile-size", 156 cl::desc("The default register tile size (if not enough were provided by" 157 " --polly-register-tile-sizes)"), 158 cl::Hidden, cl::init(2), cl::cat(PollyCategory)); 159 160 static cl::list<int> 161 RegisterTileSizes("polly-register-tile-sizes", 162 cl::desc("A tile size for each loop dimension, filled " 163 "with --polly-register-tile-size"), 164 cl::Hidden, cl::CommaSeparated, cl::cat(PollyCategory)); 165 166 static cl::opt<bool> PragmaBasedOpts( 167 "polly-pragma-based-opts", 168 cl::desc("Apply user-directed transformation from metadata"), 169 cl::init(true), cl::cat(PollyCategory)); 170 171 static cl::opt<bool> EnableReschedule("polly-reschedule", 172 cl::desc("Optimize SCoPs using ISL"), 173 cl::init(true), cl::cat(PollyCategory)); 174 175 static cl::opt<bool> 176 PMBasedOpts("polly-pattern-matching-based-opts", 177 cl::desc("Perform optimizations based on pattern matching"), 178 cl::init(true), cl::cat(PollyCategory)); 179 180 static cl::opt<bool> 181 EnablePostopts("polly-postopts", 182 cl::desc("Apply post-rescheduling optimizations such as " 183 "tiling (requires -polly-reschedule)"), 184 cl::init(true), cl::cat(PollyCategory)); 185 186 static cl::opt<bool> OptimizedScops( 187 "polly-optimized-scops", 188 cl::desc("Polly - Dump polyhedral description of Scops optimized with " 189 "the isl scheduling optimizer and the set of post-scheduling " 190 "transformations is applied on the schedule tree"), 191 cl::cat(PollyCategory)); 192 193 STATISTIC(ScopsProcessed, "Number of scops processed"); 194 STATISTIC(ScopsRescheduled, "Number of scops rescheduled"); 195 STATISTIC(ScopsOptimized, "Number of scops optimized"); 196 197 STATISTIC(NumAffineLoopsOptimized, "Number of affine loops optimized"); 198 STATISTIC(NumBoxedLoopsOptimized, "Number of boxed loops optimized"); 199 200 #define THREE_STATISTICS(VARNAME, DESC) \ 201 static Statistic VARNAME[3] = { \ 202 {DEBUG_TYPE, #VARNAME "0", DESC " (original)"}, \ 203 {DEBUG_TYPE, #VARNAME "1", DESC " (after scheduler)"}, \ 204 {DEBUG_TYPE, #VARNAME "2", DESC " (after optimizer)"}} 205 206 THREE_STATISTICS(NumBands, "Number of bands"); 207 THREE_STATISTICS(NumBandMembers, "Number of band members"); 208 THREE_STATISTICS(NumCoincident, "Number of coincident band members"); 209 THREE_STATISTICS(NumPermutable, "Number of permutable bands"); 210 THREE_STATISTICS(NumFilters, "Number of filter nodes"); 211 THREE_STATISTICS(NumExtension, "Number of extension nodes"); 212 213 STATISTIC(FirstLevelTileOpts, "Number of first level tiling applied"); 214 STATISTIC(SecondLevelTileOpts, "Number of second level tiling applied"); 215 STATISTIC(RegisterTileOpts, "Number of register tiling applied"); 216 STATISTIC(PrevectOpts, "Number of strip-mining for prevectorization applied"); 217 STATISTIC(MatMulOpts, 218 "Number of matrix multiplication patterns detected and optimized"); 219 220 namespace { 221 /// Additional parameters of the schedule optimizer. 222 /// 223 /// Target Transform Info and the SCoP dependencies used by the schedule 224 /// optimizer. 225 struct OptimizerAdditionalInfoTy { 226 const llvm::TargetTransformInfo *TTI; 227 const Dependences *D; 228 bool PatternOpts; 229 bool Postopts; 230 bool Prevect; 231 bool &DepsChanged; 232 }; 233 234 class ScheduleTreeOptimizer final { 235 public: 236 /// Apply schedule tree transformations. 237 /// 238 /// This function takes an (possibly already optimized) schedule tree and 239 /// applies a set of additional optimizations on the schedule tree. The 240 /// transformations applied include: 241 /// 242 /// - Pattern-based optimizations 243 /// - Tiling 244 /// - Prevectorization 245 /// 246 /// @param Schedule The schedule object the transformations will be applied 247 /// to. 248 /// @param OAI Target Transform Info and the SCoP dependencies. 249 /// @returns The transformed schedule. 250 static isl::schedule 251 optimizeSchedule(isl::schedule Schedule, 252 const OptimizerAdditionalInfoTy *OAI = nullptr); 253 254 /// Apply schedule tree transformations. 255 /// 256 /// This function takes a node in an (possibly already optimized) schedule 257 /// tree and applies a set of additional optimizations on this schedule tree 258 /// node and its descendants. The transformations applied include: 259 /// 260 /// - Pattern-based optimizations 261 /// - Tiling 262 /// - Prevectorization 263 /// 264 /// @param Node The schedule object post-transformations will be applied to. 265 /// @param OAI Target Transform Info and the SCoP dependencies. 266 /// @returns The transformed schedule. 267 static isl::schedule_node 268 optimizeScheduleNode(isl::schedule_node Node, 269 const OptimizerAdditionalInfoTy *OAI = nullptr); 270 271 /// Decide if the @p NewSchedule is profitable for @p S. 272 /// 273 /// @param S The SCoP we optimize. 274 /// @param NewSchedule The new schedule we computed. 275 /// 276 /// @return True, if we believe @p NewSchedule is an improvement for @p S. 277 static bool isProfitableSchedule(polly::Scop &S, isl::schedule NewSchedule); 278 279 /// Isolate a set of partial tile prefixes. 280 /// 281 /// This set should ensure that it contains only partial tile prefixes that 282 /// have exactly VectorWidth iterations. 283 /// 284 /// @param Node A schedule node band, which is a parent of a band node, 285 /// that contains a vector loop. 286 /// @return Modified isl_schedule_node. 287 static isl::schedule_node isolateFullPartialTiles(isl::schedule_node Node, 288 int VectorWidth); 289 290 private: 291 /// Check if this node is a band node we want to tile. 292 /// 293 /// We look for innermost band nodes where individual dimensions are marked as 294 /// permutable. 295 /// 296 /// @param Node The node to check. 297 static bool isTileableBandNode(isl::schedule_node Node); 298 299 /// Pre-vectorizes one scheduling dimension of a schedule band. 300 /// 301 /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and 302 /// sinks the resulting point loop. 303 /// 304 /// Example (DimToVectorize=0, VectorWidth=4): 305 /// 306 /// | Before transformation: 307 /// | 308 /// | A[i,j] -> [i,j] 309 /// | 310 /// | for (i = 0; i < 128; i++) 311 /// | for (j = 0; j < 128; j++) 312 /// | A(i,j); 313 /// 314 /// | After transformation: 315 /// | 316 /// | for (it = 0; it < 32; it+=1) 317 /// | for (j = 0; j < 128; j++) 318 /// | for (ip = 0; ip <= 3; ip++) 319 /// | A(4 * it + ip,j); 320 /// 321 /// The goal of this transformation is to create a trivially vectorizable 322 /// loop. This means a parallel loop at the innermost level that has a 323 /// constant number of iterations corresponding to the target vector width. 324 /// 325 /// This transformation creates a loop at the innermost level. The loop has 326 /// a constant number of iterations, if the number of loop iterations at 327 /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is 328 /// currently constant and not yet target specific. This function does not 329 /// reason about parallelism. 330 static isl::schedule_node prevectSchedBand(isl::schedule_node Node, 331 unsigned DimToVectorize, 332 int VectorWidth); 333 334 /// Apply additional optimizations on the bands in the schedule tree. 335 /// 336 /// We are looking for an innermost band node and apply the following 337 /// transformations: 338 /// 339 /// - Tile the band 340 /// - if the band is tileable 341 /// - if the band has more than one loop dimension 342 /// 343 /// - Prevectorize the schedule of the band (or the point loop in case of 344 /// tiling). 345 /// - if vectorization is enabled 346 /// 347 /// @param Node The schedule node to (possibly) optimize. 348 /// @param User A pointer to forward some use information 349 /// (currently unused). 350 static isl_schedule_node *optimizeBand(isl_schedule_node *Node, void *User); 351 352 /// Apply tiling optimizations on the bands in the schedule tree. 353 /// 354 /// @param Node The schedule node to (possibly) optimize. 355 static isl::schedule_node applyTileBandOpt(isl::schedule_node Node); 356 357 /// Apply prevectorization on the bands in the schedule tree. 358 /// 359 /// @param Node The schedule node to (possibly) prevectorize. 360 static isl::schedule_node applyPrevectBandOpt(isl::schedule_node Node); 361 }; 362 363 isl::schedule_node 364 ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node, 365 int VectorWidth) { 366 assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); 367 Node = Node.child(0).child(0); 368 isl::union_map SchedRelUMap = Node.get_prefix_schedule_relation(); 369 isl::union_set ScheduleRangeUSet = SchedRelUMap.range(); 370 isl::set ScheduleRange{ScheduleRangeUSet}; 371 isl::set IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth); 372 auto AtomicOption = getDimOptions(IsolateDomain.ctx(), "atomic"); 373 isl::union_set IsolateOption = getIsolateOptions(IsolateDomain, 1); 374 Node = Node.parent().parent(); 375 isl::union_set Options = IsolateOption.unite(AtomicOption); 376 isl::schedule_node_band Result = 377 Node.as<isl::schedule_node_band>().set_ast_build_options(Options); 378 return Result; 379 } 380 381 struct InsertSimdMarkers final : ScheduleNodeRewriter<InsertSimdMarkers> { 382 isl::schedule_node visitBand(isl::schedule_node_band Band) { 383 isl::schedule_node Node = visitChildren(Band); 384 385 // Only add SIMD markers to innermost bands. 386 if (!Node.first_child().isa<isl::schedule_node_leaf>()) 387 return Node; 388 389 isl::id LoopMarker = isl::id::alloc(Band.ctx(), "SIMD", nullptr); 390 return Band.insert_mark(LoopMarker); 391 } 392 }; 393 394 isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand( 395 isl::schedule_node Node, unsigned DimToVectorize, int VectorWidth) { 396 assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); 397 398 auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); 399 unsigned ScheduleDimensions = unsignedFromIslSize(Space.dim(isl::dim::set)); 400 assert(DimToVectorize < ScheduleDimensions); 401 402 if (DimToVectorize > 0) { 403 Node = isl::manage( 404 isl_schedule_node_band_split(Node.release(), DimToVectorize)); 405 Node = Node.child(0); 406 } 407 if (DimToVectorize < ScheduleDimensions - 1) 408 Node = isl::manage(isl_schedule_node_band_split(Node.release(), 1)); 409 Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); 410 auto Sizes = isl::multi_val::zero(Space); 411 Sizes = Sizes.set_val(0, isl::val(Node.ctx(), VectorWidth)); 412 Node = 413 isl::manage(isl_schedule_node_band_tile(Node.release(), Sizes.release())); 414 Node = isolateFullPartialTiles(Node, VectorWidth); 415 Node = Node.child(0); 416 // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise, 417 // we will have troubles to match it in the backend. 418 Node = Node.as<isl::schedule_node_band>().set_ast_build_options( 419 isl::union_set(Node.ctx(), "{ unroll[x]: 1 = 0 }")); 420 421 // Sink the inner loop into the smallest possible statements to make them 422 // represent a single vector instruction if possible. 423 Node = isl::manage(isl_schedule_node_band_sink(Node.release())); 424 425 // Add SIMD markers to those vector statements. 426 InsertSimdMarkers SimdMarkerInserter; 427 Node = SimdMarkerInserter.visit(Node); 428 429 PrevectOpts++; 430 return Node.parent(); 431 } 432 433 static bool isSimpleInnermostBand(const isl::schedule_node &Node) { 434 assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); 435 assert(isl_schedule_node_n_children(Node.get()) == 1); 436 437 auto ChildType = isl_schedule_node_get_type(Node.child(0).get()); 438 439 if (ChildType == isl_schedule_node_leaf) 440 return true; 441 442 if (ChildType != isl_schedule_node_sequence) 443 return false; 444 445 auto Sequence = Node.child(0); 446 447 for (int c = 0, nc = isl_schedule_node_n_children(Sequence.get()); c < nc; 448 ++c) { 449 auto Child = Sequence.child(c); 450 if (isl_schedule_node_get_type(Child.get()) != isl_schedule_node_filter) 451 return false; 452 if (isl_schedule_node_get_type(Child.child(0).get()) != 453 isl_schedule_node_leaf) 454 return false; 455 } 456 return true; 457 } 458 459 bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) { 460 if (isl_schedule_node_get_type(Node.get()) != isl_schedule_node_band) 461 return false; 462 463 if (isl_schedule_node_n_children(Node.get()) != 1) 464 return false; 465 466 if (!isl_schedule_node_band_get_permutable(Node.get())) 467 return false; 468 469 auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); 470 471 if (unsignedFromIslSize(Space.dim(isl::dim::set)) <= 1u) 472 return false; 473 474 return isSimpleInnermostBand(Node); 475 } 476 477 __isl_give isl::schedule_node 478 ScheduleTreeOptimizer::applyTileBandOpt(isl::schedule_node Node) { 479 if (FirstLevelTiling) { 480 Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes, 481 FirstLevelDefaultTileSize); 482 FirstLevelTileOpts++; 483 } 484 485 if (SecondLevelTiling) { 486 Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes, 487 SecondLevelDefaultTileSize); 488 SecondLevelTileOpts++; 489 } 490 491 if (RegisterTiling) { 492 Node = 493 applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize); 494 RegisterTileOpts++; 495 } 496 497 return Node; 498 } 499 500 isl::schedule_node 501 ScheduleTreeOptimizer::applyPrevectBandOpt(isl::schedule_node Node) { 502 auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); 503 int Dims = unsignedFromIslSize(Space.dim(isl::dim::set)); 504 505 for (int i = Dims - 1; i >= 0; i--) 506 if (Node.as<isl::schedule_node_band>().member_get_coincident(i)) { 507 Node = prevectSchedBand(Node, i, PrevectorWidth); 508 break; 509 } 510 511 return Node; 512 } 513 514 __isl_give isl_schedule_node * 515 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg, 516 void *User) { 517 const OptimizerAdditionalInfoTy *OAI = 518 static_cast<const OptimizerAdditionalInfoTy *>(User); 519 assert(OAI && "Expecting optimization options"); 520 521 isl::schedule_node Node = isl::manage(NodeArg); 522 if (!isTileableBandNode(Node)) 523 return Node.release(); 524 525 if (OAI->PatternOpts) { 526 isl::schedule_node PatternOptimizedSchedule = 527 tryOptimizeMatMulPattern(Node, OAI->TTI, OAI->D); 528 if (!PatternOptimizedSchedule.is_null()) { 529 MatMulOpts++; 530 OAI->DepsChanged = true; 531 return PatternOptimizedSchedule.release(); 532 } 533 } 534 535 if (OAI->Postopts) 536 Node = applyTileBandOpt(Node); 537 538 if (OAI->Prevect) { 539 // FIXME: Prevectorization requirements are different from those checked by 540 // isTileableBandNode. 541 Node = applyPrevectBandOpt(Node); 542 } 543 544 return Node.release(); 545 } 546 547 isl::schedule 548 ScheduleTreeOptimizer::optimizeSchedule(isl::schedule Schedule, 549 const OptimizerAdditionalInfoTy *OAI) { 550 auto Root = Schedule.get_root(); 551 Root = optimizeScheduleNode(Root, OAI); 552 return Root.get_schedule(); 553 } 554 555 isl::schedule_node ScheduleTreeOptimizer::optimizeScheduleNode( 556 isl::schedule_node Node, const OptimizerAdditionalInfoTy *OAI) { 557 Node = isl::manage(isl_schedule_node_map_descendant_bottom_up( 558 Node.release(), optimizeBand, 559 const_cast<void *>(static_cast<const void *>(OAI)))); 560 return Node; 561 } 562 563 bool ScheduleTreeOptimizer::isProfitableSchedule(Scop &S, 564 isl::schedule NewSchedule) { 565 // To understand if the schedule has been optimized we check if the schedule 566 // has changed at all. 567 // TODO: We can improve this by tracking if any necessarily beneficial 568 // transformations have been performed. This can e.g. be tiling, loop 569 // interchange, or ...) We can track this either at the place where the 570 // transformation has been performed or, in case of automatic ILP based 571 // optimizations, by comparing (yet to be defined) performance metrics 572 // before/after the scheduling optimizer 573 // (e.g., #stride-one accesses) 574 // FIXME: A schedule tree whose union_map-conversion is identical to the 575 // original schedule map may still allow for parallelization, i.e. can still 576 // be profitable. 577 auto NewScheduleMap = NewSchedule.get_map(); 578 auto OldSchedule = S.getSchedule(); 579 assert(!OldSchedule.is_null() && 580 "Only IslScheduleOptimizer can insert extension nodes " 581 "that make Scop::getSchedule() return nullptr."); 582 bool changed = !OldSchedule.is_equal(NewScheduleMap); 583 return changed; 584 } 585 586 class IslScheduleOptimizerWrapperPass final : public ScopPass { 587 public: 588 static char ID; 589 590 explicit IslScheduleOptimizerWrapperPass() : ScopPass(ID) {} 591 592 /// Optimize the schedule of the SCoP @p S. 593 bool runOnScop(Scop &S) override; 594 595 /// Print the new schedule for the SCoP @p S. 596 void printScop(raw_ostream &OS, Scop &S) const override; 597 598 /// Register all analyses and transformation required. 599 void getAnalysisUsage(AnalysisUsage &AU) const override; 600 601 /// Release the internal memory. 602 void releaseMemory() override { 603 LastSchedule = {}; 604 IslCtx.reset(); 605 } 606 607 private: 608 std::shared_ptr<isl_ctx> IslCtx; 609 isl::schedule LastSchedule; 610 }; 611 612 char IslScheduleOptimizerWrapperPass::ID = 0; 613 614 #ifndef NDEBUG 615 static void printSchedule(llvm::raw_ostream &OS, const isl::schedule &Schedule, 616 StringRef Desc) { 617 isl::ctx Ctx = Schedule.ctx(); 618 isl_printer *P = isl_printer_to_str(Ctx.get()); 619 P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); 620 P = isl_printer_print_schedule(P, Schedule.get()); 621 char *Str = isl_printer_get_str(P); 622 OS << Desc << ": \n" << Str << "\n"; 623 free(Str); 624 isl_printer_free(P); 625 } 626 #endif 627 628 /// Collect statistics for the schedule tree. 629 /// 630 /// @param Schedule The schedule tree to analyze. If not a schedule tree it is 631 /// ignored. 632 /// @param Version The version of the schedule tree that is analyzed. 633 /// 0 for the original schedule tree before any transformation. 634 /// 1 for the schedule tree after isl's rescheduling. 635 /// 2 for the schedule tree after optimizations are applied 636 /// (tiling, pattern matching) 637 static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) { 638 auto Root = Schedule.get_root(); 639 if (Root.is_null()) 640 return; 641 642 isl_schedule_node_foreach_descendant_top_down( 643 Root.get(), 644 [](__isl_keep isl_schedule_node *nodeptr, void *user) -> isl_bool { 645 isl::schedule_node Node = isl::manage_copy(nodeptr); 646 int Version = *static_cast<int *>(user); 647 648 switch (isl_schedule_node_get_type(Node.get())) { 649 case isl_schedule_node_band: { 650 NumBands[Version]++; 651 if (isl_schedule_node_band_get_permutable(Node.get()) == 652 isl_bool_true) 653 NumPermutable[Version]++; 654 655 int CountMembers = isl_schedule_node_band_n_member(Node.get()); 656 NumBandMembers[Version] += CountMembers; 657 for (int i = 0; i < CountMembers; i += 1) { 658 if (Node.as<isl::schedule_node_band>().member_get_coincident(i)) 659 NumCoincident[Version]++; 660 } 661 break; 662 } 663 664 case isl_schedule_node_filter: 665 NumFilters[Version]++; 666 break; 667 668 case isl_schedule_node_extension: 669 NumExtension[Version]++; 670 break; 671 672 default: 673 break; 674 } 675 676 return isl_bool_true; 677 }, 678 &Version); 679 } 680 681 static void runIslScheduleOptimizer( 682 Scop &S, 683 function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps, 684 TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, 685 isl::schedule &LastSchedule, bool &DepsChanged) { 686 687 // Skip SCoPs in case they're already optimised by PPCGCodeGeneration 688 if (S.isToBeSkipped()) 689 return; 690 691 // Skip empty SCoPs but still allow code generation as it will delete the 692 // loops present but not needed. 693 if (S.getSize() == 0) { 694 S.markAsOptimized(); 695 return; 696 } 697 698 ScopsProcessed++; 699 700 // Schedule without optimizations. 701 isl::schedule Schedule = S.getScheduleTree(); 702 walkScheduleTreeForStatistics(S.getScheduleTree(), 0); 703 LLVM_DEBUG(printSchedule(dbgs(), Schedule, "Original schedule tree")); 704 705 bool HasUserTransformation = false; 706 if (PragmaBasedOpts) { 707 isl::schedule ManuallyTransformed = applyManualTransformations( 708 &S, Schedule, GetDeps(Dependences::AL_Statement), ORE); 709 if (ManuallyTransformed.is_null()) { 710 LLVM_DEBUG(dbgs() << "Error during manual optimization\n"); 711 return; 712 } 713 714 if (ManuallyTransformed.get() != Schedule.get()) { 715 // User transformations have precedence over other transformations. 716 HasUserTransformation = true; 717 Schedule = std::move(ManuallyTransformed); 718 LLVM_DEBUG( 719 printSchedule(dbgs(), Schedule, "After manual transformations")); 720 } 721 } 722 723 // Only continue if either manual transformations have been applied or we are 724 // allowed to apply heuristics. 725 // TODO: Detect disabled heuristics and no user-directed transformation 726 // metadata earlier in ScopDetection. 727 if (!HasUserTransformation && S.hasDisableHeuristicsHint()) { 728 LLVM_DEBUG(dbgs() << "Heuristic optimizations disabled by metadata\n"); 729 return; 730 } 731 732 // Get dependency analysis. 733 const Dependences &D = GetDeps(Dependences::AL_Statement); 734 if (D.getSharedIslCtx() != S.getSharedIslCtx()) { 735 LLVM_DEBUG(dbgs() << "DependenceInfo for another SCoP/isl_ctx\n"); 736 return; 737 } 738 if (!D.hasValidDependences()) { 739 LLVM_DEBUG(dbgs() << "Dependency information not available\n"); 740 return; 741 } 742 743 // Apply ISL's algorithm only if not overriden by the user. Note that 744 // post-rescheduling optimizations (tiling, pattern-based, prevectorization) 745 // rely on the coincidence/permutable annotations on schedule tree bands that 746 // are added by the rescheduling analyzer. Therefore, disabling the 747 // rescheduler implicitly also disables these optimizations. 748 if (!EnableReschedule) { 749 LLVM_DEBUG(dbgs() << "Skipping rescheduling due to command line option\n"); 750 } else if (HasUserTransformation) { 751 LLVM_DEBUG( 752 dbgs() << "Skipping rescheduling due to manual transformation\n"); 753 } else { 754 // Build input data. 755 int ValidityKinds = 756 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 757 int ProximityKinds; 758 759 if (OptimizeDeps == "all") 760 ProximityKinds = 761 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 762 else if (OptimizeDeps == "raw") 763 ProximityKinds = Dependences::TYPE_RAW; 764 else { 765 errs() << "Do not know how to optimize for '" << OptimizeDeps << "'" 766 << " Falling back to optimizing all dependences.\n"; 767 ProximityKinds = 768 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 769 } 770 771 isl::union_set Domain = S.getDomains(); 772 773 if (Domain.is_null()) 774 return; 775 776 isl::union_map Validity = D.getDependences(ValidityKinds); 777 isl::union_map Proximity = D.getDependences(ProximityKinds); 778 779 // Simplify the dependences by removing the constraints introduced by the 780 // domains. This can speed up the scheduling time significantly, as large 781 // constant coefficients will be removed from the dependences. The 782 // introduction of some additional dependences reduces the possible 783 // transformations, but in most cases, such transformation do not seem to be 784 // interesting anyway. In some cases this option may stop the scheduler to 785 // find any schedule. 786 if (SimplifyDeps == "yes") { 787 Validity = Validity.gist_domain(Domain); 788 Validity = Validity.gist_range(Domain); 789 Proximity = Proximity.gist_domain(Domain); 790 Proximity = Proximity.gist_range(Domain); 791 } else if (SimplifyDeps != "no") { 792 errs() 793 << "warning: Option -polly-opt-simplify-deps should either be 'yes' " 794 "or 'no'. Falling back to default: 'yes'\n"; 795 } 796 797 LLVM_DEBUG(dbgs() << "\n\nCompute schedule from: "); 798 LLVM_DEBUG(dbgs() << "Domain := " << Domain << ";\n"); 799 LLVM_DEBUG(dbgs() << "Proximity := " << Proximity << ";\n"); 800 LLVM_DEBUG(dbgs() << "Validity := " << Validity << ";\n"); 801 802 int IslMaximizeBands; 803 if (MaximizeBandDepth == "yes") { 804 IslMaximizeBands = 1; 805 } else if (MaximizeBandDepth == "no") { 806 IslMaximizeBands = 0; 807 } else { 808 errs() 809 << "warning: Option -polly-opt-maximize-bands should either be 'yes'" 810 " or 'no'. Falling back to default: 'yes'\n"; 811 IslMaximizeBands = 1; 812 } 813 814 int IslOuterCoincidence; 815 if (OuterCoincidence == "yes") { 816 IslOuterCoincidence = 1; 817 } else if (OuterCoincidence == "no") { 818 IslOuterCoincidence = 0; 819 } else { 820 errs() << "warning: Option -polly-opt-outer-coincidence should either be " 821 "'yes' or 'no'. Falling back to default: 'no'\n"; 822 IslOuterCoincidence = 0; 823 } 824 825 isl_ctx *Ctx = S.getIslCtx().get(); 826 827 isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence); 828 isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands); 829 isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm); 830 isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient); 831 isl_options_set_tile_scale_tile_loops(Ctx, 0); 832 833 auto OnErrorStatus = isl_options_get_on_error(Ctx); 834 isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE); 835 836 auto SC = isl::schedule_constraints::on_domain(Domain); 837 SC = SC.set_proximity(Proximity); 838 SC = SC.set_validity(Validity); 839 SC = SC.set_coincidence(Validity); 840 Schedule = SC.compute_schedule(); 841 isl_options_set_on_error(Ctx, OnErrorStatus); 842 843 ScopsRescheduled++; 844 LLVM_DEBUG(printSchedule(dbgs(), Schedule, "After rescheduling")); 845 } 846 847 walkScheduleTreeForStatistics(Schedule, 1); 848 849 // In cases the scheduler is not able to optimize the code, we just do not 850 // touch the schedule. 851 if (Schedule.is_null()) 852 return; 853 854 if (GreedyFusion) { 855 isl::union_map Validity = D.getDependences( 856 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW); 857 Schedule = applyGreedyFusion(Schedule, Validity); 858 assert(!Schedule.is_null()); 859 } 860 861 // Apply post-rescheduling optimizations (if enabled) and/or prevectorization. 862 const OptimizerAdditionalInfoTy OAI = { 863 TTI, 864 const_cast<Dependences *>(&D), 865 /*PatternOpts=*/!HasUserTransformation && PMBasedOpts, 866 /*Postopts=*/!HasUserTransformation && EnablePostopts, 867 /*Prevect=*/PollyVectorizerChoice != VECTORIZER_NONE, 868 DepsChanged}; 869 if (OAI.PatternOpts || OAI.Postopts || OAI.Prevect) { 870 Schedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI); 871 Schedule = hoistExtensionNodes(Schedule); 872 LLVM_DEBUG(printSchedule(dbgs(), Schedule, "After post-optimizations")); 873 walkScheduleTreeForStatistics(Schedule, 2); 874 } 875 876 // Skip profitability check if user transformation(s) have been applied. 877 if (!HasUserTransformation && 878 !ScheduleTreeOptimizer::isProfitableSchedule(S, Schedule)) 879 return; 880 881 auto ScopStats = S.getStatistics(); 882 ScopsOptimized++; 883 NumAffineLoopsOptimized += ScopStats.NumAffineLoops; 884 NumBoxedLoopsOptimized += ScopStats.NumBoxedLoops; 885 LastSchedule = Schedule; 886 887 S.setScheduleTree(Schedule); 888 S.markAsOptimized(); 889 890 if (OptimizedScops) 891 errs() << S; 892 } 893 894 bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) { 895 releaseMemory(); 896 897 Function &F = S.getFunction(); 898 IslCtx = S.getSharedIslCtx(); 899 900 auto getDependences = 901 [this](Dependences::AnalysisLevel) -> const Dependences & { 902 return getAnalysis<DependenceInfo>().getDependences( 903 Dependences::AL_Statement); 904 }; 905 OptimizationRemarkEmitter &ORE = 906 getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 907 TargetTransformInfo *TTI = 908 &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 909 910 bool DepsChanged = false; 911 runIslScheduleOptimizer(S, getDependences, TTI, &ORE, LastSchedule, 912 DepsChanged); 913 if (DepsChanged) 914 getAnalysis<DependenceInfo>().abandonDependences(); 915 return false; 916 } 917 918 static void runScheduleOptimizerPrinter(raw_ostream &OS, 919 isl::schedule LastSchedule) { 920 isl_printer *p; 921 char *ScheduleStr; 922 923 OS << "Calculated schedule:\n"; 924 925 if (LastSchedule.is_null()) { 926 OS << "n/a\n"; 927 return; 928 } 929 930 p = isl_printer_to_str(LastSchedule.ctx().get()); 931 p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK); 932 p = isl_printer_print_schedule(p, LastSchedule.get()); 933 ScheduleStr = isl_printer_get_str(p); 934 isl_printer_free(p); 935 936 OS << ScheduleStr << "\n"; 937 938 free(ScheduleStr); 939 } 940 941 void IslScheduleOptimizerWrapperPass::printScop(raw_ostream &OS, Scop &) const { 942 runScheduleOptimizerPrinter(OS, LastSchedule); 943 } 944 945 void IslScheduleOptimizerWrapperPass::getAnalysisUsage( 946 AnalysisUsage &AU) const { 947 ScopPass::getAnalysisUsage(AU); 948 AU.addRequired<DependenceInfo>(); 949 AU.addRequired<TargetTransformInfoWrapperPass>(); 950 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 951 952 AU.addPreserved<DependenceInfo>(); 953 AU.addPreserved<OptimizationRemarkEmitterWrapperPass>(); 954 } 955 956 } // namespace 957 958 Pass *polly::createIslScheduleOptimizerWrapperPass() { 959 return new IslScheduleOptimizerWrapperPass(); 960 } 961 962 INITIALIZE_PASS_BEGIN(IslScheduleOptimizerWrapperPass, "polly-opt-isl", 963 "Polly - Optimize schedule of SCoP", false, false); 964 INITIALIZE_PASS_DEPENDENCY(DependenceInfo); 965 INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); 966 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass); 967 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass); 968 INITIALIZE_PASS_END(IslScheduleOptimizerWrapperPass, "polly-opt-isl", 969 "Polly - Optimize schedule of SCoP", false, false) 970 971 static llvm::PreservedAnalyses 972 runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM, 973 ScopStandardAnalysisResults &SAR, SPMUpdater &U, 974 raw_ostream *OS) { 975 DependenceAnalysis::Result &Deps = SAM.getResult<DependenceAnalysis>(S, SAR); 976 auto GetDeps = [&Deps](Dependences::AnalysisLevel) -> const Dependences & { 977 return Deps.getDependences(Dependences::AL_Statement); 978 }; 979 OptimizationRemarkEmitter ORE(&S.getFunction()); 980 TargetTransformInfo *TTI = &SAR.TTI; 981 isl::schedule LastSchedule; 982 bool DepsChanged = false; 983 runIslScheduleOptimizer(S, GetDeps, TTI, &ORE, LastSchedule, DepsChanged); 984 if (DepsChanged) 985 Deps.abandonDependences(); 986 987 if (OS) { 988 *OS << "Printing analysis 'Polly - Optimize schedule of SCoP' for region: '" 989 << S.getName() << "' in function '" << S.getFunction().getName() 990 << "':\n"; 991 runScheduleOptimizerPrinter(*OS, LastSchedule); 992 } 993 return PreservedAnalyses::all(); 994 } 995 996 llvm::PreservedAnalyses 997 IslScheduleOptimizerPass::run(Scop &S, ScopAnalysisManager &SAM, 998 ScopStandardAnalysisResults &SAR, SPMUpdater &U) { 999 return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, nullptr); 1000 } 1001 1002 llvm::PreservedAnalyses 1003 IslScheduleOptimizerPrinterPass::run(Scop &S, ScopAnalysisManager &SAM, 1004 ScopStandardAnalysisResults &SAR, 1005 SPMUpdater &U) { 1006 return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, &OS); 1007 } 1008 1009 //===----------------------------------------------------------------------===// 1010 1011 namespace { 1012 /// Print result from IslScheduleOptimizerWrapperPass. 1013 class IslScheduleOptimizerPrinterLegacyPass final : public ScopPass { 1014 public: 1015 static char ID; 1016 1017 IslScheduleOptimizerPrinterLegacyPass() 1018 : IslScheduleOptimizerPrinterLegacyPass(outs()) {} 1019 explicit IslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS) 1020 : ScopPass(ID), OS(OS) {} 1021 1022 bool runOnScop(Scop &S) override { 1023 IslScheduleOptimizerWrapperPass &P = 1024 getAnalysis<IslScheduleOptimizerWrapperPass>(); 1025 1026 OS << "Printing analysis '" << P.getPassName() << "' for region: '" 1027 << S.getRegion().getNameStr() << "' in function '" 1028 << S.getFunction().getName() << "':\n"; 1029 P.printScop(OS, S); 1030 1031 return false; 1032 } 1033 1034 void getAnalysisUsage(AnalysisUsage &AU) const override { 1035 ScopPass::getAnalysisUsage(AU); 1036 AU.addRequired<IslScheduleOptimizerWrapperPass>(); 1037 AU.setPreservesAll(); 1038 } 1039 1040 private: 1041 llvm::raw_ostream &OS; 1042 }; 1043 1044 char IslScheduleOptimizerPrinterLegacyPass::ID = 0; 1045 } // namespace 1046 1047 Pass *polly::createIslScheduleOptimizerPrinterLegacyPass(raw_ostream &OS) { 1048 return new IslScheduleOptimizerPrinterLegacyPass(OS); 1049 } 1050 1051 INITIALIZE_PASS_BEGIN(IslScheduleOptimizerPrinterLegacyPass, 1052 "polly-print-opt-isl", 1053 "Polly - Print optimizer schedule of SCoP", false, false); 1054 INITIALIZE_PASS_DEPENDENCY(IslScheduleOptimizerWrapperPass) 1055 INITIALIZE_PASS_END(IslScheduleOptimizerPrinterLegacyPass, 1056 "polly-print-opt-isl", 1057 "Polly - Print optimizer schedule of SCoP", false, false) 1058