1 //===- Schedule.cpp - Calculate an optimized schedule ---------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass generates an entirely new schedule tree from the data dependences 10 // and iteration domains. The new schedule tree is computed in two steps: 11 // 12 // 1) The isl scheduling optimizer is run 13 // 14 // The isl scheduling optimizer creates a new schedule tree that maximizes 15 // parallelism and tileability and minimizes data-dependence distances. The 16 // algorithm used is a modified version of the ``Pluto'' algorithm: 17 // 18 // U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan. 19 // A Practical Automatic Polyhedral Parallelizer and Locality Optimizer. 20 // In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language 21 // Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008. 22 // 23 // 2) A set of post-scheduling transformations is applied on the schedule tree. 24 // 25 // These optimizations include: 26 // 27 // - Tiling of the innermost tilable bands 28 // - Prevectorization - The choice of a possible outer loop that is strip-mined 29 // to the innermost level to enable inner-loop 30 // vectorization. 31 // - Some optimizations for spatial locality are also planned. 32 // 33 // For a detailed description of the schedule tree itself please see section 6 34 // of: 35 // 36 // Polyhedral AST generation is more than scanning polyhedra 37 // Tobias Grosser, Sven Verdoolaege, Albert Cohen 38 // ACM Transactions on Programming Languages and Systems (TOPLAS), 39 // 37(4), July 2015 40 // http://www.grosser.es/#pub-polyhedral-AST-generation 41 // 42 // This publication also contains a detailed discussion of the different options 43 // for polyhedral loop unrolling, full/partial tile separation and other uses 44 // of the schedule tree. 45 // 46 //===----------------------------------------------------------------------===// 47 48 #include "polly/ScheduleOptimizer.h" 49 #include "polly/CodeGen/CodeGeneration.h" 50 #include "polly/DependenceInfo.h" 51 #include "polly/ManualOptimizer.h" 52 #include "polly/MatmulOptimizer.h" 53 #include "polly/Options.h" 54 #include "polly/ScheduleTreeTransform.h" 55 #include "polly/Support/ISLOStream.h" 56 #include "llvm/ADT/Sequence.h" 57 #include "llvm/ADT/Statistic.h" 58 #include "llvm/InitializePasses.h" 59 #include "llvm/Support/CommandLine.h" 60 #include "isl/options.h" 61 62 using namespace llvm; 63 using namespace polly; 64 65 namespace llvm { 66 class Loop; 67 class Module; 68 } // namespace llvm 69 70 #define DEBUG_TYPE "polly-opt-isl" 71 72 static cl::opt<std::string> 73 OptimizeDeps("polly-opt-optimize-only", 74 cl::desc("Only a certain kind of dependences (all/raw)"), 75 cl::Hidden, cl::init("all"), cl::ZeroOrMore, 76 cl::cat(PollyCategory)); 77 78 static cl::opt<std::string> 79 SimplifyDeps("polly-opt-simplify-deps", 80 cl::desc("Dependences should be simplified (yes/no)"), 81 cl::Hidden, cl::init("yes"), cl::ZeroOrMore, 82 cl::cat(PollyCategory)); 83 84 static cl::opt<int> MaxConstantTerm( 85 "polly-opt-max-constant-term", 86 cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden, 87 cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory)); 88 89 static cl::opt<int> MaxCoefficient( 90 "polly-opt-max-coefficient", 91 cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden, 92 cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory)); 93 94 static cl::opt<std::string> FusionStrategy( 95 "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"), 96 cl::Hidden, cl::init("min"), cl::ZeroOrMore, cl::cat(PollyCategory)); 97 98 static cl::opt<std::string> 99 MaximizeBandDepth("polly-opt-maximize-bands", 100 cl::desc("Maximize the band depth (yes/no)"), cl::Hidden, 101 cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory)); 102 103 static cl::opt<std::string> OuterCoincidence( 104 "polly-opt-outer-coincidence", 105 cl::desc("Try to construct schedules where the outer member of each band " 106 "satisfies the coincidence constraints (yes/no)"), 107 cl::Hidden, cl::init("no"), cl::ZeroOrMore, cl::cat(PollyCategory)); 108 109 static cl::opt<int> PrevectorWidth( 110 "polly-prevect-width", 111 cl::desc( 112 "The number of loop iterations to strip-mine for pre-vectorization"), 113 cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory)); 114 115 static cl::opt<bool> FirstLevelTiling("polly-tiling", 116 cl::desc("Enable loop tiling"), 117 cl::init(true), cl::ZeroOrMore, 118 cl::cat(PollyCategory)); 119 120 static cl::opt<int> FirstLevelDefaultTileSize( 121 "polly-default-tile-size", 122 cl::desc("The default tile size (if not enough were provided by" 123 " --polly-tile-sizes)"), 124 cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory)); 125 126 static cl::list<int> 127 FirstLevelTileSizes("polly-tile-sizes", 128 cl::desc("A tile size for each loop dimension, filled " 129 "with --polly-default-tile-size"), 130 cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, 131 cl::cat(PollyCategory)); 132 133 static cl::opt<bool> 134 SecondLevelTiling("polly-2nd-level-tiling", 135 cl::desc("Enable a 2nd level loop of loop tiling"), 136 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); 137 138 static cl::opt<int> SecondLevelDefaultTileSize( 139 "polly-2nd-level-default-tile-size", 140 cl::desc("The default 2nd-level tile size (if not enough were provided by" 141 " --polly-2nd-level-tile-sizes)"), 142 cl::Hidden, cl::init(16), cl::ZeroOrMore, cl::cat(PollyCategory)); 143 144 static cl::list<int> 145 SecondLevelTileSizes("polly-2nd-level-tile-sizes", 146 cl::desc("A tile size for each loop dimension, filled " 147 "with --polly-default-tile-size"), 148 cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, 149 cl::cat(PollyCategory)); 150 151 static cl::opt<bool> RegisterTiling("polly-register-tiling", 152 cl::desc("Enable register tiling"), 153 cl::init(false), cl::ZeroOrMore, 154 cl::cat(PollyCategory)); 155 156 static cl::opt<int> RegisterDefaultTileSize( 157 "polly-register-tiling-default-tile-size", 158 cl::desc("The default register tile size (if not enough were provided by" 159 " --polly-register-tile-sizes)"), 160 cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory)); 161 162 static cl::list<int> 163 RegisterTileSizes("polly-register-tile-sizes", 164 cl::desc("A tile size for each loop dimension, filled " 165 "with --polly-register-tile-size"), 166 cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, 167 cl::cat(PollyCategory)); 168 169 static cl::opt<bool> PragmaBasedOpts( 170 "polly-pragma-based-opts", 171 cl::desc("Apply user-directed transformation from metadata"), 172 cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory)); 173 174 static cl::opt<bool> 175 PMBasedOpts("polly-pattern-matching-based-opts", 176 cl::desc("Perform optimizations based on pattern matching"), 177 cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory)); 178 179 static cl::opt<bool> OptimizedScops( 180 "polly-optimized-scops", 181 cl::desc("Polly - Dump polyhedral description of Scops optimized with " 182 "the isl scheduling optimizer and the set of post-scheduling " 183 "transformations is applied on the schedule tree"), 184 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); 185 186 STATISTIC(ScopsProcessed, "Number of scops processed"); 187 STATISTIC(ScopsRescheduled, "Number of scops rescheduled"); 188 STATISTIC(ScopsOptimized, "Number of scops optimized"); 189 190 STATISTIC(NumAffineLoopsOptimized, "Number of affine loops optimized"); 191 STATISTIC(NumBoxedLoopsOptimized, "Number of boxed loops optimized"); 192 193 #define THREE_STATISTICS(VARNAME, DESC) \ 194 static Statistic VARNAME[3] = { \ 195 {DEBUG_TYPE, #VARNAME "0", DESC " (original)"}, \ 196 {DEBUG_TYPE, #VARNAME "1", DESC " (after scheduler)"}, \ 197 {DEBUG_TYPE, #VARNAME "2", DESC " (after optimizer)"}} 198 199 THREE_STATISTICS(NumBands, "Number of bands"); 200 THREE_STATISTICS(NumBandMembers, "Number of band members"); 201 THREE_STATISTICS(NumCoincident, "Number of coincident band members"); 202 THREE_STATISTICS(NumPermutable, "Number of permutable bands"); 203 THREE_STATISTICS(NumFilters, "Number of filter nodes"); 204 THREE_STATISTICS(NumExtension, "Number of extension nodes"); 205 206 STATISTIC(FirstLevelTileOpts, "Number of first level tiling applied"); 207 STATISTIC(SecondLevelTileOpts, "Number of second level tiling applied"); 208 STATISTIC(RegisterTileOpts, "Number of register tiling applied"); 209 STATISTIC(PrevectOpts, "Number of strip-mining for prevectorization applied"); 210 STATISTIC(MatMulOpts, 211 "Number of matrix multiplication patterns detected and optimized"); 212 213 namespace { 214 /// Additional parameters of the schedule optimizer. 215 /// 216 /// Target Transform Info and the SCoP dependencies used by the schedule 217 /// optimizer. 218 struct OptimizerAdditionalInfoTy { 219 const llvm::TargetTransformInfo *TTI; 220 const Dependences *D; 221 }; 222 223 class ScheduleTreeOptimizer { 224 public: 225 /// Apply schedule tree transformations. 226 /// 227 /// This function takes an (possibly already optimized) schedule tree and 228 /// applies a set of additional optimizations on the schedule tree. The 229 /// transformations applied include: 230 /// 231 /// - Tiling 232 /// - Prevectorization 233 /// 234 /// @param Schedule The schedule object the transformations will be applied 235 /// to. 236 /// @param OAI Target Transform Info and the SCoP dependencies. 237 /// @returns The transformed schedule. 238 static isl::schedule 239 optimizeSchedule(isl::schedule Schedule, 240 const OptimizerAdditionalInfoTy *OAI = nullptr); 241 242 /// Apply schedule tree transformations. 243 /// 244 /// This function takes a node in an (possibly already optimized) schedule 245 /// tree and applies a set of additional optimizations on this schedule tree 246 /// node and its descendants. The transformations applied include: 247 /// 248 /// - Tiling 249 /// - Prevectorization 250 /// 251 /// @param Node The schedule object post-transformations will be applied to. 252 /// @param OAI Target Transform Info and the SCoP dependencies. 253 /// @returns The transformed schedule. 254 static isl::schedule_node 255 optimizeScheduleNode(isl::schedule_node Node, 256 const OptimizerAdditionalInfoTy *OAI = nullptr); 257 258 /// Decide if the @p NewSchedule is profitable for @p S. 259 /// 260 /// @param S The SCoP we optimize. 261 /// @param NewSchedule The new schedule we computed. 262 /// 263 /// @return True, if we believe @p NewSchedule is an improvement for @p S. 264 static bool isProfitableSchedule(polly::Scop &S, isl::schedule NewSchedule); 265 266 /// Isolate a set of partial tile prefixes. 267 /// 268 /// This set should ensure that it contains only partial tile prefixes that 269 /// have exactly VectorWidth iterations. 270 /// 271 /// @param Node A schedule node band, which is a parent of a band node, 272 /// that contains a vector loop. 273 /// @return Modified isl_schedule_node. 274 static isl::schedule_node isolateFullPartialTiles(isl::schedule_node Node, 275 int VectorWidth); 276 277 private: 278 /// Check if this node is a band node we want to tile. 279 /// 280 /// We look for innermost band nodes where individual dimensions are marked as 281 /// permutable. 282 /// 283 /// @param Node The node to check. 284 static bool isTileableBandNode(isl::schedule_node Node); 285 286 /// Pre-vectorizes one scheduling dimension of a schedule band. 287 /// 288 /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and 289 /// sinks the resulting point loop. 290 /// 291 /// Example (DimToVectorize=0, VectorWidth=4): 292 /// 293 /// | Before transformation: 294 /// | 295 /// | A[i,j] -> [i,j] 296 /// | 297 /// | for (i = 0; i < 128; i++) 298 /// | for (j = 0; j < 128; j++) 299 /// | A(i,j); 300 /// 301 /// | After transformation: 302 /// | 303 /// | for (it = 0; it < 32; it+=1) 304 /// | for (j = 0; j < 128; j++) 305 /// | for (ip = 0; ip <= 3; ip++) 306 /// | A(4 * it + ip,j); 307 /// 308 /// The goal of this transformation is to create a trivially vectorizable 309 /// loop. This means a parallel loop at the innermost level that has a 310 /// constant number of iterations corresponding to the target vector width. 311 /// 312 /// This transformation creates a loop at the innermost level. The loop has 313 /// a constant number of iterations, if the number of loop iterations at 314 /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is 315 /// currently constant and not yet target specific. This function does not 316 /// reason about parallelism. 317 static isl::schedule_node prevectSchedBand(isl::schedule_node Node, 318 unsigned DimToVectorize, 319 int VectorWidth); 320 321 /// Apply additional optimizations on the bands in the schedule tree. 322 /// 323 /// We are looking for an innermost band node and apply the following 324 /// transformations: 325 /// 326 /// - Tile the band 327 /// - if the band is tileable 328 /// - if the band has more than one loop dimension 329 /// 330 /// - Prevectorize the schedule of the band (or the point loop in case of 331 /// tiling). 332 /// - if vectorization is enabled 333 /// 334 /// @param Node The schedule node to (possibly) optimize. 335 /// @param User A pointer to forward some use information 336 /// (currently unused). 337 static isl_schedule_node *optimizeBand(isl_schedule_node *Node, void *User); 338 339 /// Apply additional optimizations on the bands in the schedule tree. 340 /// 341 /// We apply the following 342 /// transformations: 343 /// 344 /// - Tile the band 345 /// - Prevectorize the schedule of the band (or the point loop in case of 346 /// tiling). 347 /// - if vectorization is enabled 348 /// 349 /// @param Node The schedule node to (possibly) optimize. 350 /// @param User A pointer to forward some use information 351 /// (currently unused). 352 static isl::schedule_node standardBandOpts(isl::schedule_node Node, 353 void *User); 354 }; 355 356 isl::schedule_node 357 ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node, 358 int VectorWidth) { 359 assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); 360 Node = Node.child(0).child(0); 361 isl::union_map SchedRelUMap = Node.get_prefix_schedule_relation(); 362 isl::union_set ScheduleRangeUSet = SchedRelUMap.range(); 363 isl::set ScheduleRange{ScheduleRangeUSet}; 364 isl::set IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth); 365 auto AtomicOption = getDimOptions(IsolateDomain.ctx(), "atomic"); 366 isl::union_set IsolateOption = getIsolateOptions(IsolateDomain, 1); 367 Node = Node.parent().parent(); 368 isl::union_set Options = IsolateOption.unite(AtomicOption); 369 isl::schedule_node_band Result = 370 Node.as<isl::schedule_node_band>().set_ast_build_options(Options); 371 return Result; 372 } 373 374 isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand( 375 isl::schedule_node Node, unsigned DimToVectorize, int VectorWidth) { 376 assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); 377 378 auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); 379 isl_size ScheduleDimensions = Space.dim(isl::dim::set).release(); 380 assert((isl_size)DimToVectorize < ScheduleDimensions); 381 382 if (DimToVectorize > 0) { 383 Node = isl::manage( 384 isl_schedule_node_band_split(Node.release(), DimToVectorize)); 385 Node = Node.child(0); 386 } 387 if ((isl_size)DimToVectorize < ScheduleDimensions - 1) 388 Node = isl::manage(isl_schedule_node_band_split(Node.release(), 1)); 389 Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); 390 auto Sizes = isl::multi_val::zero(Space); 391 Sizes = Sizes.set_val(0, isl::val(Node.ctx(), VectorWidth)); 392 Node = 393 isl::manage(isl_schedule_node_band_tile(Node.release(), Sizes.release())); 394 Node = isolateFullPartialTiles(Node, VectorWidth); 395 Node = Node.child(0); 396 // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise, 397 // we will have troubles to match it in the backend. 398 isl::schedule_node_band NodeBand = 399 Node.as<isl::schedule_node_band>().set_ast_build_options( 400 isl::union_set(Node.ctx(), "{ unroll[x]: 1 = 0 }")); 401 Node = isl::manage(isl_schedule_node_band_sink(NodeBand.release())); 402 Node = Node.child(0); 403 if (isl_schedule_node_get_type(Node.get()) == isl_schedule_node_leaf) 404 Node = Node.parent(); 405 auto LoopMarker = isl::id::alloc(Node.ctx(), "SIMD", nullptr); 406 PrevectOpts++; 407 return Node.insert_mark(LoopMarker); 408 } 409 410 static bool isSimpleInnermostBand(const isl::schedule_node &Node) { 411 assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); 412 assert(isl_schedule_node_n_children(Node.get()) == 1); 413 414 auto ChildType = isl_schedule_node_get_type(Node.child(0).get()); 415 416 if (ChildType == isl_schedule_node_leaf) 417 return true; 418 419 if (ChildType != isl_schedule_node_sequence) 420 return false; 421 422 auto Sequence = Node.child(0); 423 424 for (int c = 0, nc = isl_schedule_node_n_children(Sequence.get()); c < nc; 425 ++c) { 426 auto Child = Sequence.child(c); 427 if (isl_schedule_node_get_type(Child.get()) != isl_schedule_node_filter) 428 return false; 429 if (isl_schedule_node_get_type(Child.child(0).get()) != 430 isl_schedule_node_leaf) 431 return false; 432 } 433 return true; 434 } 435 436 bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) { 437 if (isl_schedule_node_get_type(Node.get()) != isl_schedule_node_band) 438 return false; 439 440 if (isl_schedule_node_n_children(Node.get()) != 1) 441 return false; 442 443 if (!isl_schedule_node_band_get_permutable(Node.get())) 444 return false; 445 446 auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); 447 auto Dims = Space.dim(isl::dim::set).release(); 448 449 if (Dims <= 1) 450 return false; 451 452 return isSimpleInnermostBand(Node); 453 } 454 455 __isl_give isl::schedule_node 456 ScheduleTreeOptimizer::standardBandOpts(isl::schedule_node Node, void *User) { 457 if (FirstLevelTiling) { 458 Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes, 459 FirstLevelDefaultTileSize); 460 FirstLevelTileOpts++; 461 } 462 463 if (SecondLevelTiling) { 464 Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes, 465 SecondLevelDefaultTileSize); 466 SecondLevelTileOpts++; 467 } 468 469 if (RegisterTiling) { 470 Node = 471 applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize); 472 RegisterTileOpts++; 473 } 474 475 if (PollyVectorizerChoice == VECTORIZER_NONE) 476 return Node; 477 478 auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); 479 auto Dims = Space.dim(isl::dim::set).release(); 480 481 for (int i = Dims - 1; i >= 0; i--) 482 if (Node.as<isl::schedule_node_band>().member_get_coincident(i)) { 483 Node = prevectSchedBand(Node, i, PrevectorWidth); 484 break; 485 } 486 487 return Node; 488 } 489 490 __isl_give isl_schedule_node * 491 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node, 492 void *User) { 493 if (!isTileableBandNode(isl::manage_copy(Node))) 494 return Node; 495 496 const OptimizerAdditionalInfoTy *OAI = 497 static_cast<const OptimizerAdditionalInfoTy *>(User); 498 499 if (PMBasedOpts && User) { 500 isl::schedule_node PatternOptimizedSchedule = 501 tryOptimizeMatMulPattern(isl::manage_copy(Node), OAI->TTI, OAI->D); 502 if (!PatternOptimizedSchedule.is_null()) { 503 MatMulOpts++; 504 isl_schedule_node_free(Node); 505 return PatternOptimizedSchedule.release(); 506 } 507 } 508 509 return standardBandOpts(isl::manage(Node), User).release(); 510 } 511 512 isl::schedule 513 ScheduleTreeOptimizer::optimizeSchedule(isl::schedule Schedule, 514 const OptimizerAdditionalInfoTy *OAI) { 515 auto Root = Schedule.get_root(); 516 Root = optimizeScheduleNode(Root, OAI); 517 return Root.get_schedule(); 518 } 519 520 isl::schedule_node ScheduleTreeOptimizer::optimizeScheduleNode( 521 isl::schedule_node Node, const OptimizerAdditionalInfoTy *OAI) { 522 Node = isl::manage(isl_schedule_node_map_descendant_bottom_up( 523 Node.release(), optimizeBand, 524 const_cast<void *>(static_cast<const void *>(OAI)))); 525 return Node; 526 } 527 528 bool ScheduleTreeOptimizer::isProfitableSchedule(Scop &S, 529 isl::schedule NewSchedule) { 530 // To understand if the schedule has been optimized we check if the schedule 531 // has changed at all. 532 // TODO: We can improve this by tracking if any necessarily beneficial 533 // transformations have been performed. This can e.g. be tiling, loop 534 // interchange, or ...) We can track this either at the place where the 535 // transformation has been performed or, in case of automatic ILP based 536 // optimizations, by comparing (yet to be defined) performance metrics 537 // before/after the scheduling optimizer 538 // (e.g., #stride-one accesses) 539 auto NewScheduleMap = NewSchedule.get_map(); 540 auto OldSchedule = S.getSchedule(); 541 assert(!OldSchedule.is_null() && 542 "Only IslScheduleOptimizer can insert extension nodes " 543 "that make Scop::getSchedule() return nullptr."); 544 bool changed = !OldSchedule.is_equal(NewScheduleMap); 545 return changed; 546 } 547 548 class IslScheduleOptimizerWrapperPass : public ScopPass { 549 public: 550 static char ID; 551 552 explicit IslScheduleOptimizerWrapperPass() : ScopPass(ID) {} 553 554 /// Optimize the schedule of the SCoP @p S. 555 bool runOnScop(Scop &S) override; 556 557 /// Print the new schedule for the SCoP @p S. 558 void printScop(raw_ostream &OS, Scop &S) const override; 559 560 /// Register all analyses and transformation required. 561 void getAnalysisUsage(AnalysisUsage &AU) const override; 562 563 /// Release the internal memory. 564 void releaseMemory() override { 565 LastSchedule = {}; 566 IslCtx.reset(); 567 } 568 569 private: 570 std::shared_ptr<isl_ctx> IslCtx; 571 isl::schedule LastSchedule; 572 }; 573 574 char IslScheduleOptimizerWrapperPass::ID = 0; 575 576 #ifndef NDEBUG 577 static void printSchedule(llvm::raw_ostream &OS, const isl::schedule &Schedule, 578 StringRef Desc) { 579 isl::ctx Ctx = Schedule.ctx(); 580 isl_printer *P = isl_printer_to_str(Ctx.get()); 581 P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); 582 P = isl_printer_print_schedule(P, Schedule.get()); 583 char *Str = isl_printer_get_str(P); 584 OS << Desc << ": \n" << Str << "\n"; 585 free(Str); 586 isl_printer_free(P); 587 } 588 #endif 589 590 /// Collect statistics for the schedule tree. 591 /// 592 /// @param Schedule The schedule tree to analyze. If not a schedule tree it is 593 /// ignored. 594 /// @param Version The version of the schedule tree that is analyzed. 595 /// 0 for the original schedule tree before any transformation. 596 /// 1 for the schedule tree after isl's rescheduling. 597 /// 2 for the schedule tree after optimizations are applied 598 /// (tiling, pattern matching) 599 static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) { 600 auto Root = Schedule.get_root(); 601 if (Root.is_null()) 602 return; 603 604 isl_schedule_node_foreach_descendant_top_down( 605 Root.get(), 606 [](__isl_keep isl_schedule_node *nodeptr, void *user) -> isl_bool { 607 isl::schedule_node Node = isl::manage_copy(nodeptr); 608 int Version = *static_cast<int *>(user); 609 610 switch (isl_schedule_node_get_type(Node.get())) { 611 case isl_schedule_node_band: { 612 NumBands[Version]++; 613 if (isl_schedule_node_band_get_permutable(Node.get()) == 614 isl_bool_true) 615 NumPermutable[Version]++; 616 617 int CountMembers = isl_schedule_node_band_n_member(Node.get()); 618 NumBandMembers[Version] += CountMembers; 619 for (int i = 0; i < CountMembers; i += 1) { 620 if (Node.as<isl::schedule_node_band>().member_get_coincident(i)) 621 NumCoincident[Version]++; 622 } 623 break; 624 } 625 626 case isl_schedule_node_filter: 627 NumFilters[Version]++; 628 break; 629 630 case isl_schedule_node_extension: 631 NumExtension[Version]++; 632 break; 633 634 default: 635 break; 636 } 637 638 return isl_bool_true; 639 }, 640 &Version); 641 } 642 643 static bool runIslScheduleOptimizer( 644 Scop &S, 645 function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps, 646 TargetTransformInfo *TTI, isl::schedule &LastSchedule) { 647 // Skip SCoPs in case they're already optimised by PPCGCodeGeneration 648 if (S.isToBeSkipped()) 649 return false; 650 651 // Skip empty SCoPs but still allow code generation as it will delete the 652 // loops present but not needed. 653 if (S.getSize() == 0) { 654 S.markAsOptimized(); 655 return false; 656 } 657 658 ScopsProcessed++; 659 660 // Schedule without optimizations. 661 isl::schedule Schedule = S.getScheduleTree(); 662 walkScheduleTreeForStatistics(S.getScheduleTree(), 0); 663 LLVM_DEBUG(printSchedule(dbgs(), Schedule, "Original schedule tree")); 664 665 bool HasUserTransformation = false; 666 if (PragmaBasedOpts) { 667 isl::schedule ManuallyTransformed = 668 applyManualTransformations(&S, Schedule); 669 if (ManuallyTransformed.is_null()) { 670 LLVM_DEBUG(dbgs() << "Error during manual optimization\n"); 671 return false; 672 } 673 674 if (ManuallyTransformed.get() != Schedule.get()) { 675 // User transformations have precedence over other transformations. 676 HasUserTransformation = true; 677 Schedule = std::move(ManuallyTransformed); 678 LLVM_DEBUG( 679 printSchedule(dbgs(), Schedule, "After manual transformations")); 680 } 681 } 682 683 // Only continue if either manual transformations have been applied or we are 684 // allowed to apply heuristics. 685 // TODO: Detect disabled heuristics and no user-directed transformation 686 // metadata earlier in ScopDetection. 687 if (!HasUserTransformation && S.hasDisableHeuristicsHint()) { 688 LLVM_DEBUG(dbgs() << "Heuristic optimizations disabled by metadata\n"); 689 return false; 690 } 691 692 // Get dependency analysis. 693 const Dependences &D = GetDeps(Dependences::AL_Statement); 694 if (D.getSharedIslCtx() != S.getSharedIslCtx()) { 695 LLVM_DEBUG(dbgs() << "DependenceInfo for another SCoP/isl_ctx\n"); 696 return false; 697 } 698 if (!D.hasValidDependences()) { 699 LLVM_DEBUG(dbgs() << "Dependency information not available\n"); 700 return false; 701 } 702 703 // Apply ISL's algorithm only if not overriden by the user. Note that 704 // post-rescheduling optimizations (tiling, pattern-based, prevectorization) 705 // rely on the coincidence/permutable annotations on schedule tree bands that 706 // are added by the rescheduling analyzer. Therefore, disabling the 707 // rescheduler implicitly also disables these optimizations. 708 if (HasUserTransformation) { 709 LLVM_DEBUG( 710 dbgs() << "Skipping rescheduling due to manual transformation\n"); 711 } else { 712 // Build input data. 713 int ValidityKinds = 714 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 715 int ProximityKinds; 716 717 if (OptimizeDeps == "all") 718 ProximityKinds = 719 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 720 else if (OptimizeDeps == "raw") 721 ProximityKinds = Dependences::TYPE_RAW; 722 else { 723 errs() << "Do not know how to optimize for '" << OptimizeDeps << "'" 724 << " Falling back to optimizing all dependences.\n"; 725 ProximityKinds = 726 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 727 } 728 729 isl::union_set Domain = S.getDomains(); 730 731 if (Domain.is_null()) 732 return false; 733 734 isl::union_map Validity = D.getDependences(ValidityKinds); 735 isl::union_map Proximity = D.getDependences(ProximityKinds); 736 737 // Simplify the dependences by removing the constraints introduced by the 738 // domains. This can speed up the scheduling time significantly, as large 739 // constant coefficients will be removed from the dependences. The 740 // introduction of some additional dependences reduces the possible 741 // transformations, but in most cases, such transformation do not seem to be 742 // interesting anyway. In some cases this option may stop the scheduler to 743 // find any schedule. 744 if (SimplifyDeps == "yes") { 745 Validity = Validity.gist_domain(Domain); 746 Validity = Validity.gist_range(Domain); 747 Proximity = Proximity.gist_domain(Domain); 748 Proximity = Proximity.gist_range(Domain); 749 } else if (SimplifyDeps != "no") { 750 errs() 751 << "warning: Option -polly-opt-simplify-deps should either be 'yes' " 752 "or 'no'. Falling back to default: 'yes'\n"; 753 } 754 755 LLVM_DEBUG(dbgs() << "\n\nCompute schedule from: "); 756 LLVM_DEBUG(dbgs() << "Domain := " << Domain << ";\n"); 757 LLVM_DEBUG(dbgs() << "Proximity := " << Proximity << ";\n"); 758 LLVM_DEBUG(dbgs() << "Validity := " << Validity << ";\n"); 759 760 unsigned IslSerializeSCCs; 761 762 if (FusionStrategy == "max") { 763 IslSerializeSCCs = 0; 764 } else if (FusionStrategy == "min") { 765 IslSerializeSCCs = 1; 766 } else { 767 errs() << "warning: Unknown fusion strategy. Falling back to maximal " 768 "fusion.\n"; 769 IslSerializeSCCs = 0; 770 } 771 772 int IslMaximizeBands; 773 774 if (MaximizeBandDepth == "yes") { 775 IslMaximizeBands = 1; 776 } else if (MaximizeBandDepth == "no") { 777 IslMaximizeBands = 0; 778 } else { 779 errs() 780 << "warning: Option -polly-opt-maximize-bands should either be 'yes'" 781 " or 'no'. Falling back to default: 'yes'\n"; 782 IslMaximizeBands = 1; 783 } 784 785 int IslOuterCoincidence; 786 787 if (OuterCoincidence == "yes") { 788 IslOuterCoincidence = 1; 789 } else if (OuterCoincidence == "no") { 790 IslOuterCoincidence = 0; 791 } else { 792 errs() << "warning: Option -polly-opt-outer-coincidence should either be " 793 "'yes' or 'no'. Falling back to default: 'no'\n"; 794 IslOuterCoincidence = 0; 795 } 796 797 isl_ctx *Ctx = S.getIslCtx().get(); 798 799 isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence); 800 isl_options_set_schedule_serialize_sccs(Ctx, IslSerializeSCCs); 801 isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands); 802 isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm); 803 isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient); 804 isl_options_set_tile_scale_tile_loops(Ctx, 0); 805 806 auto OnErrorStatus = isl_options_get_on_error(Ctx); 807 isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE); 808 809 auto SC = isl::schedule_constraints::on_domain(Domain); 810 SC = SC.set_proximity(Proximity); 811 SC = SC.set_validity(Validity); 812 SC = SC.set_coincidence(Validity); 813 Schedule = SC.compute_schedule(); 814 isl_options_set_on_error(Ctx, OnErrorStatus); 815 816 ScopsRescheduled++; 817 LLVM_DEBUG(printSchedule(dbgs(), Schedule, "After rescheduling")); 818 } 819 820 walkScheduleTreeForStatistics(Schedule, 1); 821 822 // In cases the scheduler is not able to optimize the code, we just do not 823 // touch the schedule. 824 if (Schedule.is_null()) 825 return false; 826 827 // Apply post-rescheduling optimizations. 828 const OptimizerAdditionalInfoTy OAI = {TTI, const_cast<Dependences *>(&D)}; 829 Schedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI); 830 Schedule = hoistExtensionNodes(Schedule); 831 LLVM_DEBUG(printSchedule(dbgs(), Schedule, "After post-optimizations")); 832 walkScheduleTreeForStatistics(Schedule, 2); 833 834 if (!ScheduleTreeOptimizer::isProfitableSchedule(S, Schedule)) 835 return false; 836 837 auto ScopStats = S.getStatistics(); 838 ScopsOptimized++; 839 NumAffineLoopsOptimized += ScopStats.NumAffineLoops; 840 NumBoxedLoopsOptimized += ScopStats.NumBoxedLoops; 841 LastSchedule = Schedule; 842 843 S.setScheduleTree(Schedule); 844 S.markAsOptimized(); 845 846 if (OptimizedScops) 847 errs() << S; 848 849 return false; 850 } 851 852 bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) { 853 releaseMemory(); 854 855 Function &F = S.getFunction(); 856 IslCtx = S.getSharedIslCtx(); 857 858 auto getDependences = 859 [this](Dependences::AnalysisLevel) -> const Dependences & { 860 return getAnalysis<DependenceInfo>().getDependences( 861 Dependences::AL_Statement); 862 }; 863 // auto &Deps = getAnalysis<DependenceInfo>(); 864 TargetTransformInfo *TTI = 865 &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 866 return runIslScheduleOptimizer(S, getDependences, TTI, LastSchedule); 867 } 868 869 static void runScheduleOptimizerPrinter(raw_ostream &OS, 870 isl::schedule LastSchedule) { 871 isl_printer *p; 872 char *ScheduleStr; 873 874 OS << "Calculated schedule:\n"; 875 876 if (LastSchedule.is_null()) { 877 OS << "n/a\n"; 878 return; 879 } 880 881 p = isl_printer_to_str(LastSchedule.ctx().get()); 882 p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK); 883 p = isl_printer_print_schedule(p, LastSchedule.get()); 884 ScheduleStr = isl_printer_get_str(p); 885 isl_printer_free(p); 886 887 OS << ScheduleStr << "\n"; 888 889 free(ScheduleStr); 890 } 891 892 void IslScheduleOptimizerWrapperPass::printScop(raw_ostream &OS, Scop &) const { 893 runScheduleOptimizerPrinter(OS, LastSchedule); 894 } 895 896 void IslScheduleOptimizerWrapperPass::getAnalysisUsage( 897 AnalysisUsage &AU) const { 898 ScopPass::getAnalysisUsage(AU); 899 AU.addRequired<DependenceInfo>(); 900 AU.addRequired<TargetTransformInfoWrapperPass>(); 901 902 AU.addPreserved<DependenceInfo>(); 903 } 904 905 } // namespace 906 907 Pass *polly::createIslScheduleOptimizerWrapperPass() { 908 return new IslScheduleOptimizerWrapperPass(); 909 } 910 911 INITIALIZE_PASS_BEGIN(IslScheduleOptimizerWrapperPass, "polly-opt-isl", 912 "Polly - Optimize schedule of SCoP", false, false); 913 INITIALIZE_PASS_DEPENDENCY(DependenceInfo); 914 INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); 915 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass); 916 INITIALIZE_PASS_END(IslScheduleOptimizerWrapperPass, "polly-opt-isl", 917 "Polly - Optimize schedule of SCoP", false, false) 918 919 static llvm::PreservedAnalyses 920 runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM, 921 ScopStandardAnalysisResults &SAR, SPMUpdater &U, 922 raw_ostream *OS) { 923 DependenceAnalysis::Result &Deps = SAM.getResult<DependenceAnalysis>(S, SAR); 924 auto GetDeps = [&Deps](Dependences::AnalysisLevel) -> const Dependences & { 925 return Deps.getDependences(Dependences::AL_Statement); 926 }; 927 TargetTransformInfo *TTI = &SAR.TTI; 928 isl::schedule LastSchedule; 929 bool Modified = runIslScheduleOptimizer(S, GetDeps, TTI, LastSchedule); 930 if (OS) { 931 *OS << "Printing analysis 'Polly - Optimize schedule of SCoP' for region: '" 932 << S.getName() << "' in function '" << S.getFunction().getName() 933 << "':\n"; 934 runScheduleOptimizerPrinter(*OS, LastSchedule); 935 } 936 937 if (!Modified) 938 return PreservedAnalyses::all(); 939 940 PreservedAnalyses PA; 941 PA.preserveSet<AllAnalysesOn<Module>>(); 942 PA.preserveSet<AllAnalysesOn<Function>>(); 943 PA.preserveSet<AllAnalysesOn<Loop>>(); 944 return PA; 945 } 946 947 llvm::PreservedAnalyses 948 IslScheduleOptimizerPass::run(Scop &S, ScopAnalysisManager &SAM, 949 ScopStandardAnalysisResults &SAR, SPMUpdater &U) { 950 return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, nullptr); 951 } 952 953 llvm::PreservedAnalyses 954 IslScheduleOptimizerPrinterPass::run(Scop &S, ScopAnalysisManager &SAM, 955 ScopStandardAnalysisResults &SAR, 956 SPMUpdater &U) { 957 return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, &OS); 958 } 959