//===- ScheduleOptimizer.cpp - Calculate an optimized schedule ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass generates an entirely new schedule tree from the data dependences
// and iteration domains. The new schedule tree is computed in two steps:
//
// 1) The isl scheduling optimizer is run
//
//    The isl scheduling optimizer creates a new schedule tree that maximizes
//    parallelism and tileability and minimizes data-dependence distances. The
//    algorithm used is a modified version of the ``Pluto'' algorithm:
//
//      U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
//      A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
//      In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
//      Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008.
//
// 2) A set of post-scheduling transformations is applied on the schedule tree.
//
//    These optimizations include:
//
//     - Tiling of the innermost tilable bands
//     - Prevectorization - The choice of a possible outer loop that is strip-mined
//                          to the innermost level to enable inner-loop
//                          vectorization.
//     - Some optimizations for spatial locality are also planned.
32 // 33 // For a detailed description of the schedule tree itself please see section 6 34 // of: 35 // 36 // Polyhedral AST generation is more than scanning polyhedra 37 // Tobias Grosser, Sven Verdoolaege, Albert Cohen 38 // ACM Transactions on Programming Languages and Systems (TOPLAS), 39 // 37(4), July 2015 40 // http://www.grosser.es/#pub-polyhedral-AST-generation 41 // 42 // This publication also contains a detailed discussion of the different options 43 // for polyhedral loop unrolling, full/partial tile separation and other uses 44 // of the schedule tree. 45 // 46 //===----------------------------------------------------------------------===// 47 48 #include "polly/ScheduleOptimizer.h" 49 #include "polly/CodeGen/CodeGeneration.h" 50 #include "polly/DependenceInfo.h" 51 #include "polly/ManualOptimizer.h" 52 #include "polly/MatmulOptimizer.h" 53 #include "polly/Options.h" 54 #include "polly/ScheduleTreeTransform.h" 55 #include "polly/Support/ISLOStream.h" 56 #include "polly/Support/ISLTools.h" 57 #include "llvm/ADT/Sequence.h" 58 #include "llvm/ADT/Statistic.h" 59 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 60 #include "llvm/InitializePasses.h" 61 #include "llvm/Support/CommandLine.h" 62 #include "isl/options.h" 63 64 using namespace llvm; 65 using namespace polly; 66 67 namespace llvm { 68 class Loop; 69 class Module; 70 } // namespace llvm 71 72 #define DEBUG_TYPE "polly-opt-isl" 73 74 static cl::opt<std::string> 75 OptimizeDeps("polly-opt-optimize-only", 76 cl::desc("Only a certain kind of dependences (all/raw)"), 77 cl::Hidden, cl::init("all"), cl::ZeroOrMore, 78 cl::cat(PollyCategory)); 79 80 static cl::opt<std::string> 81 SimplifyDeps("polly-opt-simplify-deps", 82 cl::desc("Dependences should be simplified (yes/no)"), 83 cl::Hidden, cl::init("yes"), cl::ZeroOrMore, 84 cl::cat(PollyCategory)); 85 86 static cl::opt<int> MaxConstantTerm( 87 "polly-opt-max-constant-term", 88 cl::desc("The maximal constant term allowed (-1 is unlimited)"), 
cl::Hidden, 89 cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory)); 90 91 static cl::opt<int> MaxCoefficient( 92 "polly-opt-max-coefficient", 93 cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden, 94 cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory)); 95 96 static cl::opt<std::string> 97 MaximizeBandDepth("polly-opt-maximize-bands", 98 cl::desc("Maximize the band depth (yes/no)"), cl::Hidden, 99 cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory)); 100 101 static cl::opt<bool> 102 GreedyFusion("polly-loopfusion-greedy", 103 cl::desc("Aggressively try to fuse everything"), cl::Hidden, 104 cl::ZeroOrMore, cl::cat(PollyCategory)); 105 106 static cl::opt<std::string> OuterCoincidence( 107 "polly-opt-outer-coincidence", 108 cl::desc("Try to construct schedules where the outer member of each band " 109 "satisfies the coincidence constraints (yes/no)"), 110 cl::Hidden, cl::init("no"), cl::ZeroOrMore, cl::cat(PollyCategory)); 111 112 static cl::opt<int> PrevectorWidth( 113 "polly-prevect-width", 114 cl::desc( 115 "The number of loop iterations to strip-mine for pre-vectorization"), 116 cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory)); 117 118 static cl::opt<bool> FirstLevelTiling("polly-tiling", 119 cl::desc("Enable loop tiling"), 120 cl::init(true), cl::ZeroOrMore, 121 cl::cat(PollyCategory)); 122 123 static cl::opt<int> FirstLevelDefaultTileSize( 124 "polly-default-tile-size", 125 cl::desc("The default tile size (if not enough were provided by" 126 " --polly-tile-sizes)"), 127 cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory)); 128 129 static cl::list<int> 130 FirstLevelTileSizes("polly-tile-sizes", 131 cl::desc("A tile size for each loop dimension, filled " 132 "with --polly-default-tile-size"), 133 cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, 134 cl::cat(PollyCategory)); 135 136 static cl::opt<bool> 137 SecondLevelTiling("polly-2nd-level-tiling", 138 cl::desc("Enable a 2nd level loop of loop tiling"), 
139 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); 140 141 static cl::opt<int> SecondLevelDefaultTileSize( 142 "polly-2nd-level-default-tile-size", 143 cl::desc("The default 2nd-level tile size (if not enough were provided by" 144 " --polly-2nd-level-tile-sizes)"), 145 cl::Hidden, cl::init(16), cl::ZeroOrMore, cl::cat(PollyCategory)); 146 147 static cl::list<int> 148 SecondLevelTileSizes("polly-2nd-level-tile-sizes", 149 cl::desc("A tile size for each loop dimension, filled " 150 "with --polly-default-tile-size"), 151 cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, 152 cl::cat(PollyCategory)); 153 154 static cl::opt<bool> RegisterTiling("polly-register-tiling", 155 cl::desc("Enable register tiling"), 156 cl::init(false), cl::ZeroOrMore, 157 cl::cat(PollyCategory)); 158 159 static cl::opt<int> RegisterDefaultTileSize( 160 "polly-register-tiling-default-tile-size", 161 cl::desc("The default register tile size (if not enough were provided by" 162 " --polly-register-tile-sizes)"), 163 cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory)); 164 165 static cl::list<int> 166 RegisterTileSizes("polly-register-tile-sizes", 167 cl::desc("A tile size for each loop dimension, filled " 168 "with --polly-register-tile-size"), 169 cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, 170 cl::cat(PollyCategory)); 171 172 static cl::opt<bool> PragmaBasedOpts( 173 "polly-pragma-based-opts", 174 cl::desc("Apply user-directed transformation from metadata"), 175 cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory)); 176 177 static cl::opt<bool> EnableReschedule("polly-reschedule", 178 cl::desc("Optimize SCoPs using ISL"), 179 cl::init(true), cl::ZeroOrMore, 180 cl::cat(PollyCategory)); 181 182 static cl::opt<bool> 183 PMBasedOpts("polly-pattern-matching-based-opts", 184 cl::desc("Perform optimizations based on pattern matching"), 185 cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory)); 186 187 static cl::opt<bool> 188 EnablePostopts("polly-postopts", 189 
cl::desc("Apply post-rescheduling optimizations such as " 190 "tiling (requires -polly-reschedule)"), 191 cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory)); 192 193 static cl::opt<bool> OptimizedScops( 194 "polly-optimized-scops", 195 cl::desc("Polly - Dump polyhedral description of Scops optimized with " 196 "the isl scheduling optimizer and the set of post-scheduling " 197 "transformations is applied on the schedule tree"), 198 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); 199 200 STATISTIC(ScopsProcessed, "Number of scops processed"); 201 STATISTIC(ScopsRescheduled, "Number of scops rescheduled"); 202 STATISTIC(ScopsOptimized, "Number of scops optimized"); 203 204 STATISTIC(NumAffineLoopsOptimized, "Number of affine loops optimized"); 205 STATISTIC(NumBoxedLoopsOptimized, "Number of boxed loops optimized"); 206 207 #define THREE_STATISTICS(VARNAME, DESC) \ 208 static Statistic VARNAME[3] = { \ 209 {DEBUG_TYPE, #VARNAME "0", DESC " (original)"}, \ 210 {DEBUG_TYPE, #VARNAME "1", DESC " (after scheduler)"}, \ 211 {DEBUG_TYPE, #VARNAME "2", DESC " (after optimizer)"}} 212 213 THREE_STATISTICS(NumBands, "Number of bands"); 214 THREE_STATISTICS(NumBandMembers, "Number of band members"); 215 THREE_STATISTICS(NumCoincident, "Number of coincident band members"); 216 THREE_STATISTICS(NumPermutable, "Number of permutable bands"); 217 THREE_STATISTICS(NumFilters, "Number of filter nodes"); 218 THREE_STATISTICS(NumExtension, "Number of extension nodes"); 219 220 STATISTIC(FirstLevelTileOpts, "Number of first level tiling applied"); 221 STATISTIC(SecondLevelTileOpts, "Number of second level tiling applied"); 222 STATISTIC(RegisterTileOpts, "Number of register tiling applied"); 223 STATISTIC(PrevectOpts, "Number of strip-mining for prevectorization applied"); 224 STATISTIC(MatMulOpts, 225 "Number of matrix multiplication patterns detected and optimized"); 226 227 namespace { 228 /// Additional parameters of the schedule optimizer. 
///
/// Target Transform Info and the SCoP dependencies used by the schedule
/// optimizer.
struct OptimizerAdditionalInfoTy {
  /// Target information; forwarded to tryOptimizeMatMulPattern().
  const llvm::TargetTransformInfo *TTI;
  /// Dependences of the SCoP; forwarded to tryOptimizeMatMulPattern().
  const Dependences *D;
  /// If set, optimizeBand() first tries the matrix-multiplication pattern.
  bool PatternOpts;
  /// If set, optimizeBand() applies the tiling optimizations.
  bool Postopts;
  /// If set, optimizeBand() applies prevectorization.
  bool Prevect;
};

class ScheduleTreeOptimizer {
public:
  /// Apply schedule tree transformations.
  ///
  /// This function takes an (possibly already optimized) schedule tree and
  /// applies a set of additional optimizations on the schedule tree. The
  /// transformations applied include:
  ///
  ///   - Pattern-based optimizations
  ///   - Tiling
  ///   - Prevectorization
  ///
  /// @param Schedule The schedule object the transformations will be applied
  ///                 to.
  /// @param OAI      Target Transform Info and the SCoP dependencies.
  /// @returns        The transformed schedule.
  static isl::schedule
  optimizeSchedule(isl::schedule Schedule,
                   const OptimizerAdditionalInfoTy *OAI = nullptr);

  /// Apply schedule tree transformations.
  ///
  /// This function takes a node in an (possibly already optimized) schedule
  /// tree and applies a set of additional optimizations on this schedule tree
  /// node and its descendants. The transformations applied include:
  ///
  ///   - Pattern-based optimizations
  ///   - Tiling
  ///   - Prevectorization
  ///
  /// @param Node The schedule object post-transformations will be applied to.
  /// @param OAI  Target Transform Info and the SCoP dependencies.
  /// @returns    The transformed schedule.
  static isl::schedule_node
  optimizeScheduleNode(isl::schedule_node Node,
                       const OptimizerAdditionalInfoTy *OAI = nullptr);

  /// Decide if the @p NewSchedule is profitable for @p S.
  ///
  /// @param S           The SCoP we optimize.
  /// @param NewSchedule The new schedule we computed.
  ///
  /// @return True, if we believe @p NewSchedule is an improvement for @p S.
  static bool isProfitableSchedule(polly::Scop &S, isl::schedule NewSchedule);

  /// Isolate a set of partial tile prefixes.
  ///
  /// This set should ensure that it contains only partial tile prefixes that
  /// have exactly VectorWidth iterations.
  ///
  /// @param Node        A schedule node band, which is a parent of a band node,
  ///                    that contains a vector loop.
  /// @param VectorWidth The number of iterations passed on to
  ///                    getPartialTilePrefixes().
  /// @return Modified isl_schedule_node.
  static isl::schedule_node isolateFullPartialTiles(isl::schedule_node Node,
                                                    int VectorWidth);

private:
  /// Check if this node is a band node we want to tile.
  ///
  /// We look for innermost band nodes where individual dimensions are marked as
  /// permutable.
  ///
  /// @param Node The node to check.
  static bool isTileableBandNode(isl::schedule_node Node);

  /// Pre-vectorizes one scheduling dimension of a schedule band.
  ///
  /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and
  /// sinks the resulting point loop.
  ///
  /// Example (DimToVectorize=0, VectorWidth=4):
  ///
  /// | Before transformation:
  /// |
  /// | A[i,j] -> [i,j]
  /// |
  /// | for (i = 0; i < 128; i++)
  /// |    for (j = 0; j < 128; j++)
  /// |      A(i,j);
  ///
  /// | After transformation:
  /// |
  /// | for (it = 0; it < 32; it+=1)
  /// |    for (j = 0; j < 128; j++)
  /// |      for (ip = 0; ip <= 3; ip++)
  /// |        A(4 * it + ip,j);
  ///
  /// The goal of this transformation is to create a trivially vectorizable
  /// loop.  This means a parallel loop at the innermost level that has a
  /// constant number of iterations corresponding to the target vector width.
  ///
  /// This transformation creates a loop at the innermost level. The loop has
  /// a constant number of iterations, if the number of loop iterations at
  /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is
  /// currently constant and not yet target specific. This function does not
  /// reason about parallelism.
  static isl::schedule_node prevectSchedBand(isl::schedule_node Node,
                                             unsigned DimToVectorize,
                                             int VectorWidth);

  /// Apply additional optimizations on the bands in the schedule tree.
  ///
  /// We are looking for an innermost band node and apply the following
  /// transformations:
  ///
  ///  - Tile the band
  ///      - if the band is tileable
  ///      - if the band has more than one loop dimension
  ///
  ///  - Prevectorize the schedule of the band (or the point loop in case of
  ///    tiling).
  ///      - if vectorization is enabled
  ///
  /// @param Node The schedule node to (possibly) optimize.
  /// @param User Pointer to the OptimizerAdditionalInfoTy that selects which
  ///             optimizations are applied; must not be null.
  static isl_schedule_node *optimizeBand(isl_schedule_node *Node, void *User);

  /// Apply tiling optimizations on the bands in the schedule tree.
  ///
  /// @param Node The schedule node to (possibly) optimize.
  static isl::schedule_node applyTileBandOpt(isl::schedule_node Node);

  /// Apply prevectorization on the bands in the schedule tree.
  ///
  /// @param Node The schedule node to (possibly) prevectorize.
  static isl::schedule_node applyPrevectBandOpt(isl::schedule_node Node);
};

isl::schedule_node
ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node,
                                               int VectorWidth) {
  assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
  // Step two levels down to read the prefix schedule there; we return to the
  // original band below before attaching the options.
  Node = Node.child(0).child(0);
  isl::union_map SchedRelUMap = Node.get_prefix_schedule_relation();
  isl::union_set ScheduleRangeUSet = SchedRelUMap.range();
  isl::set ScheduleRange{ScheduleRangeUSet};
  isl::set IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth);
  auto AtomicOption = getDimOptions(IsolateDomain.ctx(), "atomic");
  isl::union_set IsolateOption = getIsolateOptions(IsolateDomain, 1);
  Node = Node.parent().parent();
  // Attach both the isolate and the atomic option to the band.
  isl::union_set Options = IsolateOption.unite(AtomicOption);
  isl::schedule_node_band Result =
      Node.as<isl::schedule_node_band>().set_ast_build_options(Options);
  return Result;
}

/// Rewriter that inserts a mark node named "SIMD" at each band whose first
/// child is a leaf.
struct InsertSimdMarkers : public ScheduleNodeRewriter<InsertSimdMarkers> {
  isl::schedule_node visitBand(isl::schedule_node_band Band) {
    isl::schedule_node Node = visitChildren(Band);

    // Only add SIMD markers to innermost bands.
    if (!Node.first_child().isa<isl::schedule_node_leaf>())
      return Node;

    isl::id LoopMarker = isl::id::alloc(Band.ctx(), "SIMD", nullptr);
    return Band.insert_mark(LoopMarker);
  }
};

isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand(
    isl::schedule_node Node, unsigned DimToVectorize, int VectorWidth) {
  assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);

  auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
  unsigned ScheduleDimensions = unsignedFromIslSize(Space.dim(isl::dim::set));
  assert(DimToVectorize < ScheduleDimensions);

  // Split the band so that DimToVectorize ends up in a band of its own: first
  // cut off the members before it, then the members after it.
  if (DimToVectorize > 0) {
    Node = isl::manage(
        isl_schedule_node_band_split(Node.release(), DimToVectorize));
    Node = Node.child(0);
  }
  if (DimToVectorize < ScheduleDimensions - 1)
    Node = isl::manage(isl_schedule_node_band_split(Node.release(), 1));
  // Tile the single remaining member with a tile size of VectorWidth.
  Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
  auto Sizes = isl::multi_val::zero(Space);
  Sizes = Sizes.set_val(0, isl::val(Node.ctx(), VectorWidth));
  Node =
      isl::manage(isl_schedule_node_band_tile(Node.release(), Sizes.release()));
  Node = isolateFullPartialTiles(Node, VectorWidth);
  Node = Node.child(0);
  // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise,
  // we will have troubles to match it in the backend.
  Node = Node.as<isl::schedule_node_band>().set_ast_build_options(
      isl::union_set(Node.ctx(), "{ unroll[x]: 1 = 0 }"));

  // Sink the inner loop into the smallest possible statements to make them
  // represent a single vector instruction if possible.
  Node = isl::manage(isl_schedule_node_band_sink(Node.release()));

  // Add SIMD markers to those vector statements.
  InsertSimdMarkers SimdMarkerInserter;
  Node = SimdMarkerInserter.visit(Node);

  PrevectOpts++;
  return Node.parent();
}

/// Check whether the band @p Node has only leaves below it.
///
/// This is the case if the single child of @p Node is either a leaf, or a
/// sequence whose children are all filters with a leaf as first child.
///
/// @param Node A band node with exactly one child.
static bool isSimpleInnermostBand(const isl::schedule_node &Node) {
  assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
  assert(isl_schedule_node_n_children(Node.get()) == 1);

  auto ChildType = isl_schedule_node_get_type(Node.child(0).get());

  if (ChildType == isl_schedule_node_leaf)
    return true;

  if (ChildType != isl_schedule_node_sequence)
    return false;

  auto Sequence = Node.child(0);

  for (int c = 0, nc = isl_schedule_node_n_children(Sequence.get()); c < nc;
       ++c) {
    auto Child = Sequence.child(c);
    if (isl_schedule_node_get_type(Child.get()) != isl_schedule_node_filter)
      return false;
    if (isl_schedule_node_get_type(Child.child(0).get()) !=
        isl_schedule_node_leaf)
      return false;
  }
  return true;
}

bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) {
  if (isl_schedule_node_get_type(Node.get()) != isl_schedule_node_band)
    return false;

  if (isl_schedule_node_n_children(Node.get()) != 1)
    return false;

  if (!isl_schedule_node_band_get_permutable(Node.get()))
    return false;

  auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));

  // Bands with a single member are not considered tileable.
  if (unsignedFromIslSize(Space.dim(isl::dim::set)) <= 1u)
    return false;

  return isSimpleInnermostBand(Node);
}

__isl_give isl::schedule_node
ScheduleTreeOptimizer::applyTileBandOpt(isl::schedule_node Node) {
  // Each tiling level is controlled by its own command line switch and is
  // applied on the result of the previous level.
  if (FirstLevelTiling) {
    Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes,
                    FirstLevelDefaultTileSize);
    FirstLevelTileOpts++;
  }

  if (SecondLevelTiling) {
    Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes,
                    SecondLevelDefaultTileSize);
    SecondLevelTileOpts++;
  }

  if (RegisterTiling) {
    Node =
        applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize);
    RegisterTileOpts++;
  }

  return Node;
}

isl::schedule_node
ScheduleTreeOptimizer::applyPrevectBandOpt(isl::schedule_node Node) {
  auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
  int Dims = unsignedFromIslSize(Space.dim(isl::dim::set));

  // Prevectorize the last (innermost) band member that is marked coincident,
  // if there is any.
  for (int i = Dims - 1; i >= 0; i--)
    if (Node.as<isl::schedule_node_band>().member_get_coincident(i)) {
      Node = prevectSchedBand(Node, i, PrevectorWidth);
      break;
    }

  return Node;
}

__isl_give isl_schedule_node *
ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg,
                                    void *User) {
  const OptimizerAdditionalInfoTy *OAI =
      static_cast<const OptimizerAdditionalInfoTy *>(User);
  assert(OAI && "Expecting optimization options");

  isl::schedule_node Node = isl::manage(NodeArg);
  if (!isTileableBandNode(Node))
    return Node.release();

  // A successfully matched matrix-multiplication pattern replaces the generic
  // tiling and prevectorization below.
  if (OAI->PatternOpts) {
    isl::schedule_node PatternOptimizedSchedule =
        tryOptimizeMatMulPattern(Node, OAI->TTI, OAI->D);
    if (!PatternOptimizedSchedule.is_null()) {
      MatMulOpts++;
      return PatternOptimizedSchedule.release();
    }
  }

  if (OAI->Postopts)
    Node = applyTileBandOpt(Node);

  if (OAI->Prevect) {
    // FIXME: Prevectorization requirements are different from those checked by
    // isTileableBandNode.
    Node = applyPrevectBandOpt(Node);
  }

  return Node.release();
}

isl::schedule
ScheduleTreeOptimizer::optimizeSchedule(isl::schedule Schedule,
                                        const OptimizerAdditionalInfoTy *OAI) {
  auto Root = Schedule.get_root();
  Root = optimizeScheduleNode(Root, OAI);
  return Root.get_schedule();
}

isl::schedule_node ScheduleTreeOptimizer::optimizeScheduleNode(
    isl::schedule_node Node, const OptimizerAdditionalInfoTy *OAI) {
  // The isl callback interface only offers a non-const void * user pointer;
  // optimizeBand() casts it back to a pointer-to-const.
  Node = isl::manage(isl_schedule_node_map_descendant_bottom_up(
      Node.release(), optimizeBand,
      const_cast<void *>(static_cast<const void *>(OAI))));
  return Node;
}

bool ScheduleTreeOptimizer::isProfitableSchedule(Scop &S,
                                                 isl::schedule NewSchedule) {
  // To understand if the schedule has been optimized we check if the schedule
  // has changed at all.
  // TODO: We can improve this by tracking if any necessarily beneficial
  // transformations have been performed. This can e.g. be tiling, loop
  // interchange, or ...) We can track this either at the place where the
  // transformation has been performed or, in case of automatic ILP based
  // optimizations, by comparing (yet to be defined) performance metrics
  // before/after the scheduling optimizer
  // (e.g., #stride-one accesses)
  // FIXME: A schedule tree whose union_map-conversion is identical to the
  // original schedule map may still allow for parallelization, i.e. can still
  // be profitable.
  auto NewScheduleMap = NewSchedule.get_map();
  auto OldSchedule = S.getSchedule();
  assert(!OldSchedule.is_null() &&
         "Only IslScheduleOptimizer can insert extension nodes "
         "that make Scop::getSchedule() return nullptr.");
  bool changed = !OldSchedule.is_equal(NewScheduleMap);
  return changed;
}

/// Legacy pass manager wrapper around the isl schedule optimizer.
class IslScheduleOptimizerWrapperPass : public ScopPass {
public:
  static char ID;

  explicit IslScheduleOptimizerWrapperPass() : ScopPass(ID) {}

  /// Optimize the schedule of the SCoP @p S.
  bool runOnScop(Scop &S) override;

  /// Print the new schedule for the SCoP @p S.
  void printScop(raw_ostream &OS, Scop &S) const override;

  /// Register all analyses and transformation required.
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Release the internal memory.
  void releaseMemory() override {
    // Drop the schedule before giving up our reference to the isl context.
    LastSchedule = {};
    IslCtx.reset();
  }

private:
  /// Shared reference to the isl context of the last processed SCoP.
  std::shared_ptr<isl_ctx> IslCtx;
  /// The schedule computed by the last run; printed by printScop().
  isl::schedule LastSchedule;
};

char IslScheduleOptimizerWrapperPass::ID = 0;

#ifndef NDEBUG
/// Print @p Schedule in YAML block style to @p OS, prefixed by @p Desc.
static void printSchedule(llvm::raw_ostream &OS, const isl::schedule &Schedule,
                          StringRef Desc) {
  isl::ctx Ctx = Schedule.ctx();
  isl_printer *P = isl_printer_to_str(Ctx.get());
  P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
  P = isl_printer_print_schedule(P, Schedule.get());
  char *Str = isl_printer_get_str(P);
  OS << Desc << ": \n" << Str << "\n";
  free(Str);
  isl_printer_free(P);
}
#endif

/// Collect statistics for the schedule tree.
///
/// @param Schedule The schedule tree to analyze. If not a schedule tree it is
/// ignored.
/// @param Version  The version of the schedule tree that is analyzed.
///                 0 for the original schedule tree before any transformation.
///                 1 for the schedule tree after isl's rescheduling.
640 /// 2 for the schedule tree after optimizations are applied 641 /// (tiling, pattern matching) 642 static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) { 643 auto Root = Schedule.get_root(); 644 if (Root.is_null()) 645 return; 646 647 isl_schedule_node_foreach_descendant_top_down( 648 Root.get(), 649 [](__isl_keep isl_schedule_node *nodeptr, void *user) -> isl_bool { 650 isl::schedule_node Node = isl::manage_copy(nodeptr); 651 int Version = *static_cast<int *>(user); 652 653 switch (isl_schedule_node_get_type(Node.get())) { 654 case isl_schedule_node_band: { 655 NumBands[Version]++; 656 if (isl_schedule_node_band_get_permutable(Node.get()) == 657 isl_bool_true) 658 NumPermutable[Version]++; 659 660 int CountMembers = isl_schedule_node_band_n_member(Node.get()); 661 NumBandMembers[Version] += CountMembers; 662 for (int i = 0; i < CountMembers; i += 1) { 663 if (Node.as<isl::schedule_node_band>().member_get_coincident(i)) 664 NumCoincident[Version]++; 665 } 666 break; 667 } 668 669 case isl_schedule_node_filter: 670 NumFilters[Version]++; 671 break; 672 673 case isl_schedule_node_extension: 674 NumExtension[Version]++; 675 break; 676 677 default: 678 break; 679 } 680 681 return isl_bool_true; 682 }, 683 &Version); 684 } 685 686 static bool runIslScheduleOptimizer( 687 Scop &S, 688 function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps, 689 TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, 690 isl::schedule &LastSchedule) { 691 692 // Skip SCoPs in case they're already optimised by PPCGCodeGeneration 693 if (S.isToBeSkipped()) 694 return false; 695 696 // Skip empty SCoPs but still allow code generation as it will delete the 697 // loops present but not needed. 698 if (S.getSize() == 0) { 699 S.markAsOptimized(); 700 return false; 701 } 702 703 ScopsProcessed++; 704 705 // Schedule without optimizations. 
706 isl::schedule Schedule = S.getScheduleTree(); 707 walkScheduleTreeForStatistics(S.getScheduleTree(), 0); 708 LLVM_DEBUG(printSchedule(dbgs(), Schedule, "Original schedule tree")); 709 710 bool HasUserTransformation = false; 711 if (PragmaBasedOpts) { 712 isl::schedule ManuallyTransformed = applyManualTransformations( 713 &S, Schedule, GetDeps(Dependences::AL_Statement), ORE); 714 if (ManuallyTransformed.is_null()) { 715 LLVM_DEBUG(dbgs() << "Error during manual optimization\n"); 716 return false; 717 } 718 719 if (ManuallyTransformed.get() != Schedule.get()) { 720 // User transformations have precedence over other transformations. 721 HasUserTransformation = true; 722 Schedule = std::move(ManuallyTransformed); 723 LLVM_DEBUG( 724 printSchedule(dbgs(), Schedule, "After manual transformations")); 725 } 726 } 727 728 // Only continue if either manual transformations have been applied or we are 729 // allowed to apply heuristics. 730 // TODO: Detect disabled heuristics and no user-directed transformation 731 // metadata earlier in ScopDetection. 732 if (!HasUserTransformation && S.hasDisableHeuristicsHint()) { 733 LLVM_DEBUG(dbgs() << "Heuristic optimizations disabled by metadata\n"); 734 return false; 735 } 736 737 // Get dependency analysis. 738 const Dependences &D = GetDeps(Dependences::AL_Statement); 739 if (D.getSharedIslCtx() != S.getSharedIslCtx()) { 740 LLVM_DEBUG(dbgs() << "DependenceInfo for another SCoP/isl_ctx\n"); 741 return false; 742 } 743 if (!D.hasValidDependences()) { 744 LLVM_DEBUG(dbgs() << "Dependency information not available\n"); 745 return false; 746 } 747 748 // Apply ISL's algorithm only if not overriden by the user. Note that 749 // post-rescheduling optimizations (tiling, pattern-based, prevectorization) 750 // rely on the coincidence/permutable annotations on schedule tree bands that 751 // are added by the rescheduling analyzer. Therefore, disabling the 752 // rescheduler implicitly also disables these optimizations. 
753 if (!EnableReschedule) { 754 LLVM_DEBUG(dbgs() << "Skipping rescheduling due to command line option\n"); 755 } else if (HasUserTransformation) { 756 LLVM_DEBUG( 757 dbgs() << "Skipping rescheduling due to manual transformation\n"); 758 } else { 759 // Build input data. 760 int ValidityKinds = 761 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 762 int ProximityKinds; 763 764 if (OptimizeDeps == "all") 765 ProximityKinds = 766 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 767 else if (OptimizeDeps == "raw") 768 ProximityKinds = Dependences::TYPE_RAW; 769 else { 770 errs() << "Do not know how to optimize for '" << OptimizeDeps << "'" 771 << " Falling back to optimizing all dependences.\n"; 772 ProximityKinds = 773 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 774 } 775 776 isl::union_set Domain = S.getDomains(); 777 778 if (Domain.is_null()) 779 return false; 780 781 isl::union_map Validity = D.getDependences(ValidityKinds); 782 isl::union_map Proximity = D.getDependences(ProximityKinds); 783 784 // Simplify the dependences by removing the constraints introduced by the 785 // domains. This can speed up the scheduling time significantly, as large 786 // constant coefficients will be removed from the dependences. The 787 // introduction of some additional dependences reduces the possible 788 // transformations, but in most cases, such transformation do not seem to be 789 // interesting anyway. In some cases this option may stop the scheduler to 790 // find any schedule. 791 if (SimplifyDeps == "yes") { 792 Validity = Validity.gist_domain(Domain); 793 Validity = Validity.gist_range(Domain); 794 Proximity = Proximity.gist_domain(Domain); 795 Proximity = Proximity.gist_range(Domain); 796 } else if (SimplifyDeps != "no") { 797 errs() 798 << "warning: Option -polly-opt-simplify-deps should either be 'yes' " 799 "or 'no'. 
Falling back to default: 'yes'\n"; 800 } 801 802 LLVM_DEBUG(dbgs() << "\n\nCompute schedule from: "); 803 LLVM_DEBUG(dbgs() << "Domain := " << Domain << ";\n"); 804 LLVM_DEBUG(dbgs() << "Proximity := " << Proximity << ";\n"); 805 LLVM_DEBUG(dbgs() << "Validity := " << Validity << ";\n"); 806 807 int IslMaximizeBands; 808 if (MaximizeBandDepth == "yes") { 809 IslMaximizeBands = 1; 810 } else if (MaximizeBandDepth == "no") { 811 IslMaximizeBands = 0; 812 } else { 813 errs() 814 << "warning: Option -polly-opt-maximize-bands should either be 'yes'" 815 " or 'no'. Falling back to default: 'yes'\n"; 816 IslMaximizeBands = 1; 817 } 818 819 int IslOuterCoincidence; 820 if (OuterCoincidence == "yes") { 821 IslOuterCoincidence = 1; 822 } else if (OuterCoincidence == "no") { 823 IslOuterCoincidence = 0; 824 } else { 825 errs() << "warning: Option -polly-opt-outer-coincidence should either be " 826 "'yes' or 'no'. Falling back to default: 'no'\n"; 827 IslOuterCoincidence = 0; 828 } 829 830 isl_ctx *Ctx = S.getIslCtx().get(); 831 832 isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence); 833 isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands); 834 isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm); 835 isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient); 836 isl_options_set_tile_scale_tile_loops(Ctx, 0); 837 838 auto OnErrorStatus = isl_options_get_on_error(Ctx); 839 isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE); 840 841 auto SC = isl::schedule_constraints::on_domain(Domain); 842 SC = SC.set_proximity(Proximity); 843 SC = SC.set_validity(Validity); 844 SC = SC.set_coincidence(Validity); 845 Schedule = SC.compute_schedule(); 846 isl_options_set_on_error(Ctx, OnErrorStatus); 847 848 ScopsRescheduled++; 849 LLVM_DEBUG(printSchedule(dbgs(), Schedule, "After rescheduling")); 850 } 851 852 walkScheduleTreeForStatistics(Schedule, 1); 853 854 // In cases the scheduler is not able to optimize the code, we just do not 855 // 
touch the schedule. 856 if (Schedule.is_null()) 857 return false; 858 859 if (GreedyFusion) { 860 isl::union_map Validity = D.getDependences( 861 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW); 862 Schedule = applyGreedyFusion(Schedule, Validity); 863 assert(!Schedule.is_null()); 864 } 865 866 // Apply post-rescheduling optimizations (if enabled) and/or prevectorization. 867 const OptimizerAdditionalInfoTy OAI = { 868 TTI, const_cast<Dependences *>(&D), 869 /*PatternOpts=*/!HasUserTransformation && PMBasedOpts, 870 /*Postopts=*/!HasUserTransformation && EnablePostopts, 871 /*Prevect=*/PollyVectorizerChoice != VECTORIZER_NONE}; 872 if (OAI.PatternOpts || OAI.Postopts || OAI.Prevect) { 873 Schedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI); 874 Schedule = hoistExtensionNodes(Schedule); 875 LLVM_DEBUG(printSchedule(dbgs(), Schedule, "After post-optimizations")); 876 walkScheduleTreeForStatistics(Schedule, 2); 877 } 878 879 // Skip profitability check if user transformation(s) have been applied. 
880 if (!HasUserTransformation && 881 !ScheduleTreeOptimizer::isProfitableSchedule(S, Schedule)) 882 return false; 883 884 auto ScopStats = S.getStatistics(); 885 ScopsOptimized++; 886 NumAffineLoopsOptimized += ScopStats.NumAffineLoops; 887 NumBoxedLoopsOptimized += ScopStats.NumBoxedLoops; 888 LastSchedule = Schedule; 889 890 S.setScheduleTree(Schedule); 891 S.markAsOptimized(); 892 893 if (OptimizedScops) 894 errs() << S; 895 896 return false; 897 } 898 899 bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) { 900 releaseMemory(); 901 902 Function &F = S.getFunction(); 903 IslCtx = S.getSharedIslCtx(); 904 905 auto getDependences = 906 [this](Dependences::AnalysisLevel) -> const Dependences & { 907 return getAnalysis<DependenceInfo>().getDependences( 908 Dependences::AL_Statement); 909 }; 910 OptimizationRemarkEmitter &ORE = 911 getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 912 TargetTransformInfo *TTI = 913 &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 914 return runIslScheduleOptimizer(S, getDependences, TTI, &ORE, LastSchedule); 915 } 916 917 static void runScheduleOptimizerPrinter(raw_ostream &OS, 918 isl::schedule LastSchedule) { 919 isl_printer *p; 920 char *ScheduleStr; 921 922 OS << "Calculated schedule:\n"; 923 924 if (LastSchedule.is_null()) { 925 OS << "n/a\n"; 926 return; 927 } 928 929 p = isl_printer_to_str(LastSchedule.ctx().get()); 930 p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK); 931 p = isl_printer_print_schedule(p, LastSchedule.get()); 932 ScheduleStr = isl_printer_get_str(p); 933 isl_printer_free(p); 934 935 OS << ScheduleStr << "\n"; 936 937 free(ScheduleStr); 938 } 939 940 void IslScheduleOptimizerWrapperPass::printScop(raw_ostream &OS, Scop &) const { 941 runScheduleOptimizerPrinter(OS, LastSchedule); 942 } 943 944 void IslScheduleOptimizerWrapperPass::getAnalysisUsage( 945 AnalysisUsage &AU) const { 946 ScopPass::getAnalysisUsage(AU); 947 AU.addRequired<DependenceInfo>(); 948 
AU.addRequired<TargetTransformInfoWrapperPass>(); 949 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 950 951 AU.addPreserved<DependenceInfo>(); 952 AU.addPreserved<OptimizationRemarkEmitterWrapperPass>(); 953 } 954 955 } // namespace 956 957 Pass *polly::createIslScheduleOptimizerWrapperPass() { 958 return new IslScheduleOptimizerWrapperPass(); 959 } 960 961 INITIALIZE_PASS_BEGIN(IslScheduleOptimizerWrapperPass, "polly-opt-isl", 962 "Polly - Optimize schedule of SCoP", false, false); 963 INITIALIZE_PASS_DEPENDENCY(DependenceInfo); 964 INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); 965 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass); 966 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass); 967 INITIALIZE_PASS_END(IslScheduleOptimizerWrapperPass, "polly-opt-isl", 968 "Polly - Optimize schedule of SCoP", false, false) 969 970 static llvm::PreservedAnalyses 971 runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM, 972 ScopStandardAnalysisResults &SAR, SPMUpdater &U, 973 raw_ostream *OS) { 974 DependenceAnalysis::Result &Deps = SAM.getResult<DependenceAnalysis>(S, SAR); 975 auto GetDeps = [&Deps](Dependences::AnalysisLevel) -> const Dependences & { 976 return Deps.getDependences(Dependences::AL_Statement); 977 }; 978 OptimizationRemarkEmitter ORE(&S.getFunction()); 979 TargetTransformInfo *TTI = &SAR.TTI; 980 isl::schedule LastSchedule; 981 bool Modified = runIslScheduleOptimizer(S, GetDeps, TTI, &ORE, LastSchedule); 982 if (OS) { 983 *OS << "Printing analysis 'Polly - Optimize schedule of SCoP' for region: '" 984 << S.getName() << "' in function '" << S.getFunction().getName() 985 << "':\n"; 986 runScheduleOptimizerPrinter(*OS, LastSchedule); 987 } 988 989 if (!Modified) 990 return PreservedAnalyses::all(); 991 992 PreservedAnalyses PA; 993 PA.preserveSet<AllAnalysesOn<Module>>(); 994 PA.preserveSet<AllAnalysesOn<Function>>(); 995 PA.preserveSet<AllAnalysesOn<Loop>>(); 996 return PA; 997 } 998 999 llvm::PreservedAnalyses 
1000 IslScheduleOptimizerPass::run(Scop &S, ScopAnalysisManager &SAM, 1001 ScopStandardAnalysisResults &SAR, SPMUpdater &U) { 1002 return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, nullptr); 1003 } 1004 1005 llvm::PreservedAnalyses 1006 IslScheduleOptimizerPrinterPass::run(Scop &S, ScopAnalysisManager &SAM, 1007 ScopStandardAnalysisResults &SAR, 1008 SPMUpdater &U) { 1009 return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, &OS); 1010 } 1011