1 //===- ScheduleOptimizer.cpp - Calculate an optimized schedule ------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass generates an entirely new schedule tree from the data dependences 10 // and iteration domains. The new schedule tree is computed in two steps: 11 // 12 // 1) The isl scheduling optimizer is run 13 // 14 // The isl scheduling optimizer creates a new schedule tree that maximizes 15 // parallelism and tileability and minimizes data-dependence distances. The 16 // algorithm used is a modified version of the ``Pluto'' algorithm: 17 // 18 // U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan. 19 // A Practical Automatic Polyhedral Parallelizer and Locality Optimizer. 20 // In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language 21 // Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008. 22 // 23 // 2) A set of post-scheduling transformations is applied on the schedule tree. 24 // 25 // These optimizations include: 26 // 27 // - Tiling of the innermost tilable bands 28 // - Prevectorization - The choice of a possible outer loop that is strip-mined 29 // to the innermost level to enable inner-loop 30 // vectorization. 31 // - Some optimizations for spatial locality are also planned. 32 // 33 // For a detailed description of the schedule tree itself please see section 6 34 // of: 35 // 36 // Polyhedral AST generation is more than scanning polyhedra 37 // Tobias Grosser, Sven Verdoolaege, Albert Cohen 38 // ACM Transactions on Programming Languages and Systems (TOPLAS), 39 // 37(4), July 2015 40 // http://www.grosser.es/#pub-polyhedral-AST-generation 41 // 42 // This publication also contains a detailed discussion of the different options 43 // for polyhedral loop unrolling, full/partial tile separation and other uses 44 // of the schedule tree. 45 // 46 //===----------------------------------------------------------------------===// 47 48 #include "polly/ScheduleOptimizer.h" 49 #include "polly/CodeGen/CodeGeneration.h" 50 #include "polly/DependenceInfo.h" 51 #include "polly/ManualOptimizer.h" 52 #include "polly/MatmulOptimizer.h" 53 #include "polly/Options.h" 54 #include "polly/ScheduleTreeTransform.h" 55 #include "polly/Support/ISLOStream.h" 56 #include "llvm/ADT/Sequence.h" 57 #include "llvm/ADT/Statistic.h" 58 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 59 #include "llvm/InitializePasses.h" 60 #include "llvm/Support/CommandLine.h" 61 #include "isl/options.h" 62 63 using namespace llvm; 64 using namespace polly; 65 66 namespace llvm { 67 class Loop; 68 class Module; 69 } // namespace llvm 70 71 #define DEBUG_TYPE "polly-opt-isl" 72 73 static cl::opt<std::string> 74 OptimizeDeps("polly-opt-optimize-only", 75 cl::desc("Only a certain kind of dependences (all/raw)"), 76 cl::Hidden, cl::init("all"), cl::ZeroOrMore, 77 cl::cat(PollyCategory)); 78 79 static cl::opt<std::string> 80 SimplifyDeps("polly-opt-simplify-deps", 81 cl::desc("Dependences should be simplified (yes/no)"), 82 cl::Hidden, cl::init("yes"), cl::ZeroOrMore, 83 cl::cat(PollyCategory)); 84 85 static cl::opt<int> MaxConstantTerm( 86 "polly-opt-max-constant-term", 87 cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden, 88 cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory)); 89 90 static cl::opt<int> MaxCoefficient( 91 "polly-opt-max-coefficient", 92 cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden, 93 cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory)); 94 95 static cl::opt<std::string> FusionStrategy( 96 "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"), 97 cl::Hidden, cl::init("min"), cl::ZeroOrMore, cl::cat(PollyCategory)); 98 99 static cl::opt<std::string> 100 MaximizeBandDepth("polly-opt-maximize-bands", 101 cl::desc("Maximize the band depth (yes/no)"), cl::Hidden, 102 cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory)); 103 104 static cl::opt<std::string> OuterCoincidence( 105 "polly-opt-outer-coincidence", 106 cl::desc("Try to construct schedules where the outer member of each band " 107 "satisfies the coincidence constraints (yes/no)"), 108 cl::Hidden, cl::init("no"), cl::ZeroOrMore, cl::cat(PollyCategory)); 109 110 static cl::opt<int> PrevectorWidth( 111 "polly-prevect-width", 112 cl::desc( 113 "The number of loop iterations to strip-mine for pre-vectorization"), 114 cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory)); 115 116 static cl::opt<bool> FirstLevelTiling("polly-tiling", 117 cl::desc("Enable loop tiling"), 118 cl::init(true), cl::ZeroOrMore, 119 cl::cat(PollyCategory)); 120 121 static cl::opt<int> FirstLevelDefaultTileSize( 122 "polly-default-tile-size", 123 cl::desc("The default tile size (if not enough were provided by" 124 " --polly-tile-sizes)"), 125 cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory)); 126 127 static cl::list<int> 128 FirstLevelTileSizes("polly-tile-sizes", 129 cl::desc("A tile size for each loop dimension, filled " 130 "with --polly-default-tile-size"), 131 cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, 132 cl::cat(PollyCategory)); 133 134 static cl::opt<bool> 135 SecondLevelTiling("polly-2nd-level-tiling", 136 cl::desc("Enable a 2nd level loop of loop tiling"), 137 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); 138 139 static cl::opt<int> SecondLevelDefaultTileSize( 140 "polly-2nd-level-default-tile-size", 141 cl::desc("The default 2nd-level tile size (if not enough were provided by" 142 " --polly-2nd-level-tile-sizes)"), 143 cl::Hidden, cl::init(16), cl::ZeroOrMore, cl::cat(PollyCategory)); 144 145 static cl::list<int> 146 SecondLevelTileSizes("polly-2nd-level-tile-sizes", 147 cl::desc("A tile size for each loop dimension, filled " 148 "with --polly-default-tile-size"), 149 cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, 150 cl::cat(PollyCategory)); 151 152 static cl::opt<bool> RegisterTiling("polly-register-tiling", 153 cl::desc("Enable register tiling"), 154 cl::init(false), cl::ZeroOrMore, 155 cl::cat(PollyCategory)); 156 157 static cl::opt<int> RegisterDefaultTileSize( 158 "polly-register-tiling-default-tile-size", 159 cl::desc("The default register tile size (if not enough were provided by" 160 " --polly-register-tile-sizes)"), 161 cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory)); 162 163 static cl::list<int> 164 RegisterTileSizes("polly-register-tile-sizes", 165 cl::desc("A tile size for each loop dimension, filled " 166 "with --polly-register-tile-size"), 167 cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, 168 cl::cat(PollyCategory)); 169 170 static cl::opt<bool> PragmaBasedOpts( 171 "polly-pragma-based-opts", 172 cl::desc("Apply user-directed transformation from metadata"), 173 cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory)); 174 175 static cl::opt<bool> EnableReschedule("polly-reschedule", 176 cl::desc("Optimize SCoPs using ISL"), 177 cl::init(true), cl::ZeroOrMore, 178 cl::cat(PollyCategory)); 179 180 static cl::opt<bool> 181 PMBasedOpts("polly-pattern-matching-based-opts", 182 cl::desc("Perform optimizations based on pattern matching"), 183 cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory)); 184 185 static cl::opt<bool> 186 EnablePostopts("polly-postopts", 187 cl::desc("Apply post-rescheduling optimizations such as " 188 "tiling (requires -polly-reschedule)"), 189 cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory)); 190 191 static cl::opt<bool> OptimizedScops( 192 "polly-optimized-scops", 193 cl::desc("Polly - Dump polyhedral description of Scops optimized with " 194 "the isl scheduling optimizer and the set of post-scheduling " 195 "transformations is applied on the schedule tree"), 196 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); 197 198 STATISTIC(ScopsProcessed, "Number of scops processed"); 199 STATISTIC(ScopsRescheduled, "Number of scops rescheduled"); 200 STATISTIC(ScopsOptimized, "Number of scops optimized"); 201 202 STATISTIC(NumAffineLoopsOptimized, "Number of affine loops optimized"); 203 STATISTIC(NumBoxedLoopsOptimized, "Number of boxed loops optimized"); 204 205 #define THREE_STATISTICS(VARNAME, DESC) \ 206 static Statistic VARNAME[3] = { \ 207 {DEBUG_TYPE, #VARNAME "0", DESC " (original)"}, \ 208 {DEBUG_TYPE, #VARNAME "1", DESC " (after scheduler)"}, \ 209 {DEBUG_TYPE, #VARNAME "2", DESC " (after optimizer)"}} 210 211 THREE_STATISTICS(NumBands, "Number of bands"); 212 THREE_STATISTICS(NumBandMembers, "Number of band members"); 213 THREE_STATISTICS(NumCoincident, "Number of coincident band members"); 214 THREE_STATISTICS(NumPermutable, "Number of permutable bands"); 215 THREE_STATISTICS(NumFilters, "Number of filter nodes"); 216 THREE_STATISTICS(NumExtension, "Number of extension nodes"); 217 218 STATISTIC(FirstLevelTileOpts, "Number of first level tiling applied"); 219 STATISTIC(SecondLevelTileOpts, "Number of second level tiling applied"); 220 STATISTIC(RegisterTileOpts, "Number of register tiling applied"); 221 STATISTIC(PrevectOpts, "Number of strip-mining for prevectorization applied"); 222 STATISTIC(MatMulOpts, 223 "Number of matrix multiplication patterns detected and optimized"); 224 225 namespace { 226 /// Additional parameters of the schedule optimizer. 227 /// 228 /// Target Transform Info and the SCoP dependencies used by the schedule 229 /// optimizer. 230 struct OptimizerAdditionalInfoTy { 231 const llvm::TargetTransformInfo *TTI; 232 const Dependences *D; 233 bool PatternOpts; 234 bool Postopts; 235 bool Prevect; 236 }; 237 238 class ScheduleTreeOptimizer { 239 public: 240 /// Apply schedule tree transformations. 241 /// 242 /// This function takes an (possibly already optimized) schedule tree and 243 /// applies a set of additional optimizations on the schedule tree. The 244 /// transformations applied include: 245 /// 246 /// - Pattern-based optimizations 247 /// - Tiling 248 /// - Prevectorization 249 /// 250 /// @param Schedule The schedule object the transformations will be applied 251 /// to. 252 /// @param OAI Target Transform Info and the SCoP dependencies. 253 /// @returns The transformed schedule. 254 static isl::schedule 255 optimizeSchedule(isl::schedule Schedule, 256 const OptimizerAdditionalInfoTy *OAI = nullptr); 257 258 /// Apply schedule tree transformations. 259 /// 260 /// This function takes a node in an (possibly already optimized) schedule 261 /// tree and applies a set of additional optimizations on this schedule tree 262 /// node and its descendants. The transformations applied include: 263 /// 264 /// - Pattern-based optimizations 265 /// - Tiling 266 /// - Prevectorization 267 /// 268 /// @param Node The schedule object post-transformations will be applied to. 269 /// @param OAI Target Transform Info and the SCoP dependencies. 270 /// @returns The transformed schedule. 271 static isl::schedule_node 272 optimizeScheduleNode(isl::schedule_node Node, 273 const OptimizerAdditionalInfoTy *OAI = nullptr); 274 275 /// Decide if the @p NewSchedule is profitable for @p S. 276 /// 277 /// @param S The SCoP we optimize. 278 /// @param NewSchedule The new schedule we computed. 279 /// 280 /// @return True, if we believe @p NewSchedule is an improvement for @p S. 281 static bool isProfitableSchedule(polly::Scop &S, isl::schedule NewSchedule); 282 283 /// Isolate a set of partial tile prefixes. 284 /// 285 /// This set should ensure that it contains only partial tile prefixes that 286 /// have exactly VectorWidth iterations. 287 /// 288 /// @param Node A schedule node band, which is a parent of a band node, 289 /// that contains a vector loop. 290 /// @return Modified isl_schedule_node. 291 static isl::schedule_node isolateFullPartialTiles(isl::schedule_node Node, 292 int VectorWidth); 293 294 private: 295 /// Check if this node is a band node we want to tile. 296 /// 297 /// We look for innermost band nodes where individual dimensions are marked as 298 /// permutable. 299 /// 300 /// @param Node The node to check. 301 static bool isTileableBandNode(isl::schedule_node Node); 302 303 /// Pre-vectorizes one scheduling dimension of a schedule band. 304 /// 305 /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and 306 /// sinks the resulting point loop. 307 /// 308 /// Example (DimToVectorize=0, VectorWidth=4): 309 /// 310 /// | Before transformation: 311 /// | 312 /// | A[i,j] -> [i,j] 313 /// | 314 /// | for (i = 0; i < 128; i++) 315 /// | for (j = 0; j < 128; j++) 316 /// | A(i,j); 317 /// 318 /// | After transformation: 319 /// | 320 /// | for (it = 0; it < 32; it+=1) 321 /// | for (j = 0; j < 128; j++) 322 /// | for (ip = 0; ip <= 3; ip++) 323 /// | A(4 * it + ip,j); 324 /// 325 /// The goal of this transformation is to create a trivially vectorizable 326 /// loop. This means a parallel loop at the innermost level that has a 327 /// constant number of iterations corresponding to the target vector width. 328 /// 329 /// This transformation creates a loop at the innermost level. The loop has 330 /// a constant number of iterations, if the number of loop iterations at 331 /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is 332 /// currently constant and not yet target specific. This function does not 333 /// reason about parallelism. 334 static isl::schedule_node prevectSchedBand(isl::schedule_node Node, 335 unsigned DimToVectorize, 336 int VectorWidth); 337 338 /// Apply additional optimizations on the bands in the schedule tree. 339 /// 340 /// We are looking for an innermost band node and apply the following 341 /// transformations: 342 /// 343 /// - Tile the band 344 /// - if the band is tileable 345 /// - if the band has more than one loop dimension 346 /// 347 /// - Prevectorize the schedule of the band (or the point loop in case of 348 /// tiling). 349 /// - if vectorization is enabled 350 /// 351 /// @param Node The schedule node to (possibly) optimize. 352 /// @param User A pointer to forward some use information 353 /// (currently unused). 354 static isl_schedule_node *optimizeBand(isl_schedule_node *Node, void *User); 355 356 /// Apply tiling optimizations on the bands in the schedule tree. 357 /// 358 /// @param Node The schedule node to (possibly) optimize. 359 static isl::schedule_node applyTileBandOpt(isl::schedule_node Node); 360 361 /// Apply prevectorization on the bands in the schedule tree. 362 /// 363 /// @param Node The schedule node to (possibly) prevectorize. 364 static isl::schedule_node applyPrevectBandOpt(isl::schedule_node Node); 365 }; 366 367 isl::schedule_node 368 ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node, 369 int VectorWidth) { 370 assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); 371 Node = Node.child(0).child(0); 372 isl::union_map SchedRelUMap = Node.get_prefix_schedule_relation(); 373 isl::union_set ScheduleRangeUSet = SchedRelUMap.range(); 374 isl::set ScheduleRange{ScheduleRangeUSet}; 375 isl::set IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth); 376 auto AtomicOption = getDimOptions(IsolateDomain.ctx(), "atomic"); 377 isl::union_set IsolateOption = getIsolateOptions(IsolateDomain, 1); 378 Node = Node.parent().parent(); 379 isl::union_set Options = IsolateOption.unite(AtomicOption); 380 isl::schedule_node_band Result = 381 Node.as<isl::schedule_node_band>().set_ast_build_options(Options); 382 return Result; 383 } 384 385 isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand( 386 isl::schedule_node Node, unsigned DimToVectorize, int VectorWidth) { 387 assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); 388 389 auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); 390 isl_size ScheduleDimensions = Space.dim(isl::dim::set).release(); 391 assert((isl_size)DimToVectorize < ScheduleDimensions); 392 393 if (DimToVectorize > 0) { 394 Node = isl::manage( 395 isl_schedule_node_band_split(Node.release(), DimToVectorize)); 396 Node = Node.child(0); 397 } 398 if ((isl_size)DimToVectorize < ScheduleDimensions - 1) 399 Node = isl::manage(isl_schedule_node_band_split(Node.release(), 1)); 400 Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); 401 auto Sizes = isl::multi_val::zero(Space); 402 Sizes = Sizes.set_val(0, isl::val(Node.ctx(), VectorWidth)); 403 Node = 404 isl::manage(isl_schedule_node_band_tile(Node.release(), Sizes.release())); 405 Node = isolateFullPartialTiles(Node, VectorWidth); 406 Node = Node.child(0); 407 // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise, 408 // we will have troubles to match it in the backend. 409 isl::schedule_node_band NodeBand = 410 Node.as<isl::schedule_node_band>().set_ast_build_options( 411 isl::union_set(Node.ctx(), "{ unroll[x]: 1 = 0 }")); 412 Node = isl::manage(isl_schedule_node_band_sink(NodeBand.release())); 413 Node = Node.child(0); 414 if (isl_schedule_node_get_type(Node.get()) == isl_schedule_node_leaf) 415 Node = Node.parent(); 416 auto LoopMarker = isl::id::alloc(Node.ctx(), "SIMD", nullptr); 417 PrevectOpts++; 418 return Node.insert_mark(LoopMarker); 419 } 420 421 static bool isSimpleInnermostBand(const isl::schedule_node &Node) { 422 assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); 423 assert(isl_schedule_node_n_children(Node.get()) == 1); 424 425 auto ChildType = isl_schedule_node_get_type(Node.child(0).get()); 426 427 if (ChildType == isl_schedule_node_leaf) 428 return true; 429 430 if (ChildType != isl_schedule_node_sequence) 431 return false; 432 433 auto Sequence = Node.child(0); 434 435 for (int c = 0, nc = isl_schedule_node_n_children(Sequence.get()); c < nc; 436 ++c) { 437 auto Child = Sequence.child(c); 438 if (isl_schedule_node_get_type(Child.get()) != isl_schedule_node_filter) 439 return false; 440 if (isl_schedule_node_get_type(Child.child(0).get()) != 441 isl_schedule_node_leaf) 442 return false; 443 } 444 return true; 445 } 446 447 bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) { 448 if (isl_schedule_node_get_type(Node.get()) != isl_schedule_node_band) 449 return false; 450 451 if (isl_schedule_node_n_children(Node.get()) != 1) 452 return false; 453 454 if (!isl_schedule_node_band_get_permutable(Node.get())) 455 return false; 456 457 auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); 458 auto Dims = Space.dim(isl::dim::set).release(); 459 460 if (Dims <= 1) 461 return false; 462 463 return isSimpleInnermostBand(Node); 464 } 465 466 __isl_give isl::schedule_node 467 ScheduleTreeOptimizer::applyTileBandOpt(isl::schedule_node Node) { 468 if (FirstLevelTiling) { 469 Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes, 470 FirstLevelDefaultTileSize); 471 FirstLevelTileOpts++; 472 } 473 474 if (SecondLevelTiling) { 475 Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes, 476 SecondLevelDefaultTileSize); 477 SecondLevelTileOpts++; 478 } 479 480 if (RegisterTiling) { 481 Node = 482 applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize); 483 RegisterTileOpts++; 484 } 485 486 return Node; 487 } 488 489 isl::schedule_node 490 ScheduleTreeOptimizer::applyPrevectBandOpt(isl::schedule_node Node) { 491 auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); 492 auto Dims = Space.dim(isl::dim::set).release(); 493 494 for (int i = Dims - 1; i >= 0; i--) 495 if (Node.as<isl::schedule_node_band>().member_get_coincident(i)) { 496 Node = prevectSchedBand(Node, i, PrevectorWidth); 497 break; 498 } 499 500 return Node; 501 } 502 503 __isl_give isl_schedule_node * 504 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg, 505 void *User) { 506 const OptimizerAdditionalInfoTy *OAI = 507 static_cast<const OptimizerAdditionalInfoTy *>(User); 508 assert(OAI && "Expecting optimization options"); 509 510 isl::schedule_node Node = isl::manage(NodeArg); 511 if (!isTileableBandNode(Node)) 512 return Node.release(); 513 514 if (OAI->PatternOpts) { 515 isl::schedule_node PatternOptimizedSchedule = 516 tryOptimizeMatMulPattern(Node, OAI->TTI, OAI->D); 517 if (!PatternOptimizedSchedule.is_null()) { 518 MatMulOpts++; 519 return PatternOptimizedSchedule.release(); 520 } 521 } 522 523 if (OAI->Postopts) 524 Node = applyTileBandOpt(Node); 525 526 if (OAI->Prevect) { 527 // FIXME: Prevectorization requirements are different from those checked by 528 // isTileableBandNode. 529 Node = applyPrevectBandOpt(Node); 530 } 531 532 return Node.release(); 533 } 534 535 isl::schedule 536 ScheduleTreeOptimizer::optimizeSchedule(isl::schedule Schedule, 537 const OptimizerAdditionalInfoTy *OAI) { 538 auto Root = Schedule.get_root(); 539 Root = optimizeScheduleNode(Root, OAI); 540 return Root.get_schedule(); 541 } 542 543 isl::schedule_node ScheduleTreeOptimizer::optimizeScheduleNode( 544 isl::schedule_node Node, const OptimizerAdditionalInfoTy *OAI) { 545 Node = isl::manage(isl_schedule_node_map_descendant_bottom_up( 546 Node.release(), optimizeBand, 547 const_cast<void *>(static_cast<const void *>(OAI)))); 548 return Node; 549 } 550 551 bool ScheduleTreeOptimizer::isProfitableSchedule(Scop &S, 552 isl::schedule NewSchedule) { 553 // To understand if the schedule has been optimized we check if the schedule 554 // has changed at all. 555 // TODO: We can improve this by tracking if any necessarily beneficial 556 // transformations have been performed. This can e.g. be tiling, loop 557 // interchange, or ...) We can track this either at the place where the 558 // transformation has been performed or, in case of automatic ILP based 559 // optimizations, by comparing (yet to be defined) performance metrics 560 // before/after the scheduling optimizer 561 // (e.g., #stride-one accesses) 562 // FIXME: A schedule tree whose union_map-conversion is identical to the 563 // original schedule map may still allow for parallelization, i.e. can still 564 // be profitable. 565 auto NewScheduleMap = NewSchedule.get_map(); 566 auto OldSchedule = S.getSchedule(); 567 assert(!OldSchedule.is_null() && 568 "Only IslScheduleOptimizer can insert extension nodes " 569 "that make Scop::getSchedule() return nullptr."); 570 bool changed = !OldSchedule.is_equal(NewScheduleMap); 571 return changed; 572 } 573 574 class IslScheduleOptimizerWrapperPass : public ScopPass { 575 public: 576 static char ID; 577 578 explicit IslScheduleOptimizerWrapperPass() : ScopPass(ID) {} 579 580 /// Optimize the schedule of the SCoP @p S. 581 bool runOnScop(Scop &S) override; 582 583 /// Print the new schedule for the SCoP @p S. 584 void printScop(raw_ostream &OS, Scop &S) const override; 585 586 /// Register all analyses and transformation required. 587 void getAnalysisUsage(AnalysisUsage &AU) const override; 588 589 /// Release the internal memory. 590 void releaseMemory() override { 591 LastSchedule = {}; 592 IslCtx.reset(); 593 } 594 595 private: 596 std::shared_ptr<isl_ctx> IslCtx; 597 isl::schedule LastSchedule; 598 }; 599 600 char IslScheduleOptimizerWrapperPass::ID = 0; 601 602 #ifndef NDEBUG 603 static void printSchedule(llvm::raw_ostream &OS, const isl::schedule &Schedule, 604 StringRef Desc) { 605 isl::ctx Ctx = Schedule.ctx(); 606 isl_printer *P = isl_printer_to_str(Ctx.get()); 607 P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); 608 P = isl_printer_print_schedule(P, Schedule.get()); 609 char *Str = isl_printer_get_str(P); 610 OS << Desc << ": \n" << Str << "\n"; 611 free(Str); 612 isl_printer_free(P); 613 } 614 #endif 615 616 /// Collect statistics for the schedule tree. 617 /// 618 /// @param Schedule The schedule tree to analyze. If not a schedule tree it is 619 /// ignored. 620 /// @param Version The version of the schedule tree that is analyzed. 621 /// 0 for the original schedule tree before any transformation. 622 /// 1 for the schedule tree after isl's rescheduling. 623 /// 2 for the schedule tree after optimizations are applied 624 /// (tiling, pattern matching) 625 static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) { 626 auto Root = Schedule.get_root(); 627 if (Root.is_null()) 628 return; 629 630 isl_schedule_node_foreach_descendant_top_down( 631 Root.get(), 632 [](__isl_keep isl_schedule_node *nodeptr, void *user) -> isl_bool { 633 isl::schedule_node Node = isl::manage_copy(nodeptr); 634 int Version = *static_cast<int *>(user); 635 636 switch (isl_schedule_node_get_type(Node.get())) { 637 case isl_schedule_node_band: { 638 NumBands[Version]++; 639 if (isl_schedule_node_band_get_permutable(Node.get()) == 640 isl_bool_true) 641 NumPermutable[Version]++; 642 643 int CountMembers = isl_schedule_node_band_n_member(Node.get()); 644 NumBandMembers[Version] += CountMembers; 645 for (int i = 0; i < CountMembers; i += 1) { 646 if (Node.as<isl::schedule_node_band>().member_get_coincident(i)) 647 NumCoincident[Version]++; 648 } 649 break; 650 } 651 652 case isl_schedule_node_filter: 653 NumFilters[Version]++; 654 break; 655 656 case isl_schedule_node_extension: 657 NumExtension[Version]++; 658 break; 659 660 default: 661 break; 662 } 663 664 return isl_bool_true; 665 }, 666 &Version); 667 } 668 669 static bool runIslScheduleOptimizer( 670 Scop &S, 671 function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps, 672 TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, 673 isl::schedule &LastSchedule) { 674 675 // Skip SCoPs in case they're already optimised by PPCGCodeGeneration 676 if (S.isToBeSkipped()) 677 return false; 678 679 // Skip empty SCoPs but still allow code generation as it will delete the 680 // loops present but not needed. 681 if (S.getSize() == 0) { 682 S.markAsOptimized(); 683 return false; 684 } 685 686 ScopsProcessed++; 687 688 // Schedule without optimizations. 689 isl::schedule Schedule = S.getScheduleTree(); 690 walkScheduleTreeForStatistics(S.getScheduleTree(), 0); 691 LLVM_DEBUG(printSchedule(dbgs(), Schedule, "Original schedule tree")); 692 693 bool HasUserTransformation = false; 694 if (PragmaBasedOpts) { 695 isl::schedule ManuallyTransformed = applyManualTransformations( 696 &S, Schedule, GetDeps(Dependences::AL_Statement), ORE); 697 if (ManuallyTransformed.is_null()) { 698 LLVM_DEBUG(dbgs() << "Error during manual optimization\n"); 699 return false; 700 } 701 702 if (ManuallyTransformed.get() != Schedule.get()) { 703 // User transformations have precedence over other transformations. 704 HasUserTransformation = true; 705 Schedule = std::move(ManuallyTransformed); 706 LLVM_DEBUG( 707 printSchedule(dbgs(), Schedule, "After manual transformations")); 708 } 709 } 710 711 // Only continue if either manual transformations have been applied or we are 712 // allowed to apply heuristics. 713 // TODO: Detect disabled heuristics and no user-directed transformation 714 // metadata earlier in ScopDetection. 715 if (!HasUserTransformation && S.hasDisableHeuristicsHint()) { 716 LLVM_DEBUG(dbgs() << "Heuristic optimizations disabled by metadata\n"); 717 return false; 718 } 719 720 // Get dependency analysis. 721 const Dependences &D = GetDeps(Dependences::AL_Statement); 722 if (D.getSharedIslCtx() != S.getSharedIslCtx()) { 723 LLVM_DEBUG(dbgs() << "DependenceInfo for another SCoP/isl_ctx\n"); 724 return false; 725 } 726 if (!D.hasValidDependences()) { 727 LLVM_DEBUG(dbgs() << "Dependency information not available\n"); 728 return false; 729 } 730 731 // Apply ISL's algorithm only if not overriden by the user. Note that 732 // post-rescheduling optimizations (tiling, pattern-based, prevectorization) 733 // rely on the coincidence/permutable annotations on schedule tree bands that 734 // are added by the rescheduling analyzer. Therefore, disabling the 735 // rescheduler implicitly also disables these optimizations. 736 if (!EnableReschedule) { 737 LLVM_DEBUG(dbgs() << "Skipping rescheduling due to command line option\n"); 738 } else if (HasUserTransformation) { 739 LLVM_DEBUG( 740 dbgs() << "Skipping rescheduling due to manual transformation\n"); 741 } else { 742 // Build input data. 743 int ValidityKinds = 744 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 745 int ProximityKinds; 746 747 if (OptimizeDeps == "all") 748 ProximityKinds = 749 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 750 else if (OptimizeDeps == "raw") 751 ProximityKinds = Dependences::TYPE_RAW; 752 else { 753 errs() << "Do not know how to optimize for '" << OptimizeDeps << "'" 754 << " Falling back to optimizing all dependences.\n"; 755 ProximityKinds = 756 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 757 } 758 759 isl::union_set Domain = S.getDomains(); 760 761 if (Domain.is_null()) 762 return false; 763 764 isl::union_map Validity = D.getDependences(ValidityKinds); 765 isl::union_map Proximity = D.getDependences(ProximityKinds); 766 767 // Simplify the dependences by removing the constraints introduced by the 768 // domains. This can speed up the scheduling time significantly, as large 769 // constant coefficients will be removed from the dependences. The 770 // introduction of some additional dependences reduces the possible 771 // transformations, but in most cases, such transformation do not seem to be 772 // interesting anyway. In some cases this option may stop the scheduler to 773 // find any schedule. 774 if (SimplifyDeps == "yes") { 775 Validity = Validity.gist_domain(Domain); 776 Validity = Validity.gist_range(Domain); 777 Proximity = Proximity.gist_domain(Domain); 778 Proximity = Proximity.gist_range(Domain); 779 } else if (SimplifyDeps != "no") { 780 errs() 781 << "warning: Option -polly-opt-simplify-deps should either be 'yes' " 782 "or 'no'. Falling back to default: 'yes'\n"; 783 } 784 785 LLVM_DEBUG(dbgs() << "\n\nCompute schedule from: "); 786 LLVM_DEBUG(dbgs() << "Domain := " << Domain << ";\n"); 787 LLVM_DEBUG(dbgs() << "Proximity := " << Proximity << ";\n"); 788 LLVM_DEBUG(dbgs() << "Validity := " << Validity << ";\n"); 789 790 unsigned IslSerializeSCCs; 791 792 if (FusionStrategy == "max") { 793 IslSerializeSCCs = 0; 794 } else if (FusionStrategy == "min") { 795 IslSerializeSCCs = 1; 796 } else { 797 errs() << "warning: Unknown fusion strategy. Falling back to maximal " 798 "fusion.\n"; 799 IslSerializeSCCs = 0; 800 } 801 802 int IslMaximizeBands; 803 804 if (MaximizeBandDepth == "yes") { 805 IslMaximizeBands = 1; 806 } else if (MaximizeBandDepth == "no") { 807 IslMaximizeBands = 0; 808 } else { 809 errs() 810 << "warning: Option -polly-opt-maximize-bands should either be 'yes'" 811 " or 'no'. Falling back to default: 'yes'\n"; 812 IslMaximizeBands = 1; 813 } 814 815 int IslOuterCoincidence; 816 817 if (OuterCoincidence == "yes") { 818 IslOuterCoincidence = 1; 819 } else if (OuterCoincidence == "no") { 820 IslOuterCoincidence = 0; 821 } else { 822 errs() << "warning: Option -polly-opt-outer-coincidence should either be " 823 "'yes' or 'no'. Falling back to default: 'no'\n"; 824 IslOuterCoincidence = 0; 825 } 826 827 isl_ctx *Ctx = S.getIslCtx().get(); 828 829 isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence); 830 isl_options_set_schedule_serialize_sccs(Ctx, IslSerializeSCCs); 831 isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands); 832 isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm); 833 isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient); 834 isl_options_set_tile_scale_tile_loops(Ctx, 0); 835 836 auto OnErrorStatus = isl_options_get_on_error(Ctx); 837 isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE); 838 839 auto SC = isl::schedule_constraints::on_domain(Domain); 840 SC = SC.set_proximity(Proximity); 841 SC = SC.set_validity(Validity); 842 SC = SC.set_coincidence(Validity); 843 Schedule = SC.compute_schedule(); 844 isl_options_set_on_error(Ctx, OnErrorStatus); 845 846 ScopsRescheduled++; 847 LLVM_DEBUG(printSchedule(dbgs(), Schedule, "After rescheduling")); 848 } 849 850 walkScheduleTreeForStatistics(Schedule, 1); 851 852 // In cases the scheduler is not able to optimize the code, we just do not 853 // touch the schedule. 854 if (Schedule.is_null()) 855 return false; 856 857 // Apply post-rescheduling optimizations (if enabled) and/or prevectorization. 858 const OptimizerAdditionalInfoTy OAI = { 859 TTI, const_cast<Dependences *>(&D), 860 /*PatternOpts=*/!HasUserTransformation && PMBasedOpts, 861 /*Postopts=*/!HasUserTransformation && EnablePostopts, 862 /*Prevect=*/PollyVectorizerChoice != VECTORIZER_NONE}; 863 if (OAI.PatternOpts || OAI.Postopts || OAI.Prevect) { 864 Schedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI); 865 Schedule = hoistExtensionNodes(Schedule); 866 LLVM_DEBUG(printSchedule(dbgs(), Schedule, "After post-optimizations")); 867 walkScheduleTreeForStatistics(Schedule, 2); 868 } 869 870 // Skip profitability check if user transformation(s) have been applied. 871 if (!HasUserTransformation && 872 !ScheduleTreeOptimizer::isProfitableSchedule(S, Schedule)) 873 return false; 874 875 auto ScopStats = S.getStatistics(); 876 ScopsOptimized++; 877 NumAffineLoopsOptimized += ScopStats.NumAffineLoops; 878 NumBoxedLoopsOptimized += ScopStats.NumBoxedLoops; 879 LastSchedule = Schedule; 880 881 S.setScheduleTree(Schedule); 882 S.markAsOptimized(); 883 884 if (OptimizedScops) 885 errs() << S; 886 887 return false; 888 } 889 890 bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) { 891 releaseMemory(); 892 893 Function &F = S.getFunction(); 894 IslCtx = S.getSharedIslCtx(); 895 896 auto getDependences = 897 [this](Dependences::AnalysisLevel) -> const Dependences & { 898 return getAnalysis<DependenceInfo>().getDependences( 899 Dependences::AL_Statement); 900 }; 901 OptimizationRemarkEmitter &ORE = 902 getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 903 TargetTransformInfo *TTI = 904 &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 905 return runIslScheduleOptimizer(S, getDependences, TTI, &ORE, LastSchedule); 906 } 907 908 static void runScheduleOptimizerPrinter(raw_ostream &OS, 909 isl::schedule LastSchedule) { 910 isl_printer *p; 911 char *ScheduleStr; 912 913 OS << "Calculated schedule:\n"; 914 915 if (LastSchedule.is_null()) { 916 OS << "n/a\n"; 917 return; 918 } 919 920 p = isl_printer_to_str(LastSchedule.ctx().get()); 921 p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK); 922 p = isl_printer_print_schedule(p, LastSchedule.get()); 923 ScheduleStr = isl_printer_get_str(p); 924 isl_printer_free(p); 925 926 OS << ScheduleStr << "\n"; 927 928 free(ScheduleStr); 929 } 930 931 void IslScheduleOptimizerWrapperPass::printScop(raw_ostream &OS, Scop &) const { 932 runScheduleOptimizerPrinter(OS, LastSchedule); 933 } 934 935 void IslScheduleOptimizerWrapperPass::getAnalysisUsage( 936 AnalysisUsage &AU) const { 937 ScopPass::getAnalysisUsage(AU); 938 AU.addRequired<DependenceInfo>(); 939 AU.addRequired<TargetTransformInfoWrapperPass>(); 940 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 941 942 AU.addPreserved<DependenceInfo>(); 943 AU.addPreserved<OptimizationRemarkEmitterWrapperPass>(); 944 } 945 946 } // namespace 947 948 Pass *polly::createIslScheduleOptimizerWrapperPass() { 949 return new IslScheduleOptimizerWrapperPass(); 950 } 951 952 INITIALIZE_PASS_BEGIN(IslScheduleOptimizerWrapperPass, "polly-opt-isl", 953 "Polly - Optimize schedule of SCoP", false, false); 954 INITIALIZE_PASS_DEPENDENCY(DependenceInfo); 955 INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); 956 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass); 957 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass); 958 INITIALIZE_PASS_END(IslScheduleOptimizerWrapperPass, "polly-opt-isl", 959 "Polly - Optimize schedule of SCoP", false, false) 960 961 static llvm::PreservedAnalyses 962 runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM, 963 ScopStandardAnalysisResults &SAR, SPMUpdater &U, 964 raw_ostream *OS) { 965 DependenceAnalysis::Result &Deps = SAM.getResult<DependenceAnalysis>(S, SAR); 966 auto GetDeps = [&Deps](Dependences::AnalysisLevel) -> const Dependences & { 967 return Deps.getDependences(Dependences::AL_Statement); 968 }; 969 OptimizationRemarkEmitter ORE(&S.getFunction()); 970 TargetTransformInfo *TTI = &SAR.TTI; 971 isl::schedule LastSchedule; 972 bool Modified = runIslScheduleOptimizer(S, GetDeps, TTI, &ORE, LastSchedule); 973 if (OS) { 974 *OS << "Printing analysis 'Polly - Optimize schedule of SCoP' for region: '" 975 << S.getName() << "' in function '" << S.getFunction().getName() 976 << "':\n"; 977 runScheduleOptimizerPrinter(*OS, LastSchedule); 978 } 979 980 if (!Modified) 981 return PreservedAnalyses::all(); 982 983 PreservedAnalyses PA; 984 PA.preserveSet<AllAnalysesOn<Module>>(); 985 PA.preserveSet<AllAnalysesOn<Function>>(); 986 PA.preserveSet<AllAnalysesOn<Loop>>(); 987 return PA; 988 } 989 990 llvm::PreservedAnalyses 991 IslScheduleOptimizerPass::run(Scop &S, ScopAnalysisManager &SAM, 992 ScopStandardAnalysisResults &SAR, SPMUpdater &U) { 993 return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, nullptr); 994 } 995 996 llvm::PreservedAnalyses 997 IslScheduleOptimizerPrinterPass::run(Scop &S, ScopAnalysisManager &SAM, 998 ScopStandardAnalysisResults &SAR, 999 SPMUpdater &U) { 1000 return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, &OS); 1001 } 1002