1 //===- Schedule.cpp - Calculate an optimized schedule ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This pass generates an entirey new schedule tree from the data dependences 11 // and iteration domains. The new schedule tree is computed in two steps: 12 // 13 // 1) The isl scheduling optimizer is run 14 // 15 // The isl scheduling optimizer creates a new schedule tree that maximizes 16 // parallelism and tileability and minimizes data-dependence distances. The 17 // algorithm used is a modified version of the ``Pluto'' algorithm: 18 // 19 // U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan. 20 // A Practical Automatic Polyhedral Parallelizer and Locality Optimizer. 21 // In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language 22 // Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008. 23 // 24 // 2) A set of post-scheduling transformations is applied on the schedule tree. 25 // 26 // These optimizations include: 27 // 28 // - Tiling of the innermost tilable bands 29 // - Prevectorization - The coice of a possible outer loop that is strip-mined 30 // to the innermost level to enable inner-loop 31 // vectorization. 32 // - Some optimizations for spatial locality are also planned. 33 // 34 // For a detailed description of the schedule tree itself please see section 6 35 // of: 36 // 37 // Polyhedral AST generation is more than scanning polyhedra 38 // Tobias Grosser, Sven Verdoolaege, Albert Cohen 39 // ACM Transations on Programming Languages and Systems (TOPLAS), 40 // 37(4), July 2015 41 // http://www.grosser.es/#pub-polyhedral-AST-generation 42 // 43 // This publication also contains a detailed discussion of the different options 44 // for polyhedral loop unrolling, full/partial tile separation and other uses 45 // of the schedule tree. 46 // 47 //===----------------------------------------------------------------------===// 48 49 #include "polly/ScheduleOptimizer.h" 50 #include "polly/CodeGen/CodeGeneration.h" 51 #include "polly/DependenceInfo.h" 52 #include "polly/LinkAllPasses.h" 53 #include "polly/Options.h" 54 #include "polly/ScopInfo.h" 55 #include "polly/Support/GICHelper.h" 56 #include "llvm/Support/Debug.h" 57 #include "isl/aff.h" 58 #include "isl/band.h" 59 #include "isl/constraint.h" 60 #include "isl/map.h" 61 #include "isl/options.h" 62 #include "isl/printer.h" 63 #include "isl/schedule.h" 64 #include "isl/schedule_node.h" 65 #include "isl/space.h" 66 #include "isl/union_map.h" 67 #include "isl/union_set.h" 68 69 using namespace llvm; 70 using namespace polly; 71 72 #define DEBUG_TYPE "polly-opt-isl" 73 74 static cl::opt<std::string> 75 OptimizeDeps("polly-opt-optimize-only", 76 cl::desc("Only a certain kind of dependences (all/raw)"), 77 cl::Hidden, cl::init("all"), cl::ZeroOrMore, 78 cl::cat(PollyCategory)); 79 80 static cl::opt<std::string> 81 SimplifyDeps("polly-opt-simplify-deps", 82 cl::desc("Dependences should be simplified (yes/no)"), 83 cl::Hidden, cl::init("yes"), cl::ZeroOrMore, 84 cl::cat(PollyCategory)); 85 86 static cl::opt<int> MaxConstantTerm( 87 "polly-opt-max-constant-term", 88 cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden, 89 cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory)); 90 91 static cl::opt<int> MaxCoefficient( 92 "polly-opt-max-coefficient", 93 cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden, 94 cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory)); 95 96 static cl::opt<std::string> FusionStrategy( 97 "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"), 98 cl::Hidden, cl::init("min"), cl::ZeroOrMore, cl::cat(PollyCategory)); 99 100 static cl::opt<std::string> 101 MaximizeBandDepth("polly-opt-maximize-bands", 102 cl::desc("Maximize the band depth (yes/no)"), cl::Hidden, 103 cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory)); 104 105 static cl::opt<int> PrevectorWidth( 106 "polly-prevect-width", 107 cl::desc( 108 "The number of loop iterations to strip-mine for pre-vectorization"), 109 cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory)); 110 111 static cl::opt<bool> FirstLevelTiling("polly-tiling", 112 cl::desc("Enable loop tiling"), 113 cl::init(true), cl::ZeroOrMore, 114 cl::cat(PollyCategory)); 115 116 static cl::opt<int> FirstLevelDefaultTileSize( 117 "polly-default-tile-size", 118 cl::desc("The default tile size (if not enough were provided by" 119 " --polly-tile-sizes)"), 120 cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory)); 121 122 static cl::list<int> FirstLevelTileSizes( 123 "polly-tile-sizes", cl::desc("A tile size for each loop dimension, filled " 124 "with --polly-default-tile-size"), 125 cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory)); 126 127 static cl::opt<bool> 128 SecondLevelTiling("polly-2nd-level-tiling", 129 cl::desc("Enable a 2nd level loop of loop tiling"), 130 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); 131 132 static cl::opt<int> SecondLevelDefaultTileSize( 133 "polly-2nd-level-default-tile-size", 134 cl::desc("The default 2nd-level tile size (if not enough were provided by" 135 " --polly-2nd-level-tile-sizes)"), 136 cl::Hidden, cl::init(16), cl::ZeroOrMore, cl::cat(PollyCategory)); 137 138 static cl::list<int> 139 SecondLevelTileSizes("polly-2nd-level-tile-sizes", 140 cl::desc("A tile size for each loop dimension, filled " 141 "with --polly-default-tile-size"), 142 cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, 143 cl::cat(PollyCategory)); 144 145 static cl::opt<bool> RegisterTiling("polly-register-tiling", 146 cl::desc("Enable register tiling"), 147 cl::init(false), cl::ZeroOrMore, 148 cl::cat(PollyCategory)); 149 150 static cl::opt<int> RegisterDefaultTileSize( 151 "polly-register-tiling-default-tile-size", 152 cl::desc("The default register tile size (if not enough were provided by" 153 " --polly-register-tile-sizes)"), 154 cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory)); 155 156 static cl::list<int> 157 RegisterTileSizes("polly-register-tile-sizes", 158 cl::desc("A tile size for each loop dimension, filled " 159 "with --polly-register-tile-size"), 160 cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, 161 cl::cat(PollyCategory)); 162 163 __isl_give isl_schedule_node * 164 ScheduleTreeOptimizer::prevectSchedBand(__isl_take isl_schedule_node *Node, 165 unsigned DimToVectorize, 166 int VectorWidth) { 167 assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band); 168 169 auto Space = isl_schedule_node_band_get_space(Node); 170 auto ScheduleDimensions = isl_space_dim(Space, isl_dim_set); 171 isl_space_free(Space); 172 assert(DimToVectorize < ScheduleDimensions); 173 174 if (DimToVectorize > 0) { 175 Node = isl_schedule_node_band_split(Node, DimToVectorize); 176 Node = isl_schedule_node_child(Node, 0); 177 } 178 if (DimToVectorize < ScheduleDimensions - 1) 179 Node = isl_schedule_node_band_split(Node, 1); 180 Space = isl_schedule_node_band_get_space(Node); 181 auto Sizes = isl_multi_val_zero(Space); 182 auto Ctx = isl_schedule_node_get_ctx(Node); 183 Sizes = 184 isl_multi_val_set_val(Sizes, 0, isl_val_int_from_si(Ctx, VectorWidth)); 185 Node = isl_schedule_node_band_tile(Node, Sizes); 186 Node = isl_schedule_node_child(Node, 0); 187 // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise, 188 // we will have troubles to match it in the backend. 189 Node = isl_schedule_node_band_set_ast_build_options( 190 Node, isl_union_set_read_from_str(Ctx, "{ unroll[x]: 1 = 0 }")); 191 Node = isl_schedule_node_band_sink(Node); 192 Node = isl_schedule_node_child(Node, 0); 193 return Node; 194 } 195 196 __isl_give isl_schedule_node * 197 ScheduleTreeOptimizer::tileNode(__isl_take isl_schedule_node *Node, 198 const char *Identifier, ArrayRef<int> TileSizes, 199 int DefaultTileSize) { 200 auto Ctx = isl_schedule_node_get_ctx(Node); 201 auto Space = isl_schedule_node_band_get_space(Node); 202 auto Dims = isl_space_dim(Space, isl_dim_set); 203 auto Sizes = isl_multi_val_zero(Space); 204 std::string IdentifierString(Identifier); 205 for (unsigned i = 0; i < Dims; i++) { 206 auto tileSize = i < TileSizes.size() ? TileSizes[i] : DefaultTileSize; 207 Sizes = isl_multi_val_set_val(Sizes, i, isl_val_int_from_si(Ctx, tileSize)); 208 } 209 auto TileLoopMarkerStr = IdentifierString + " - Tiles"; 210 isl_id *TileLoopMarker = 211 isl_id_alloc(Ctx, TileLoopMarkerStr.c_str(), nullptr); 212 Node = isl_schedule_node_insert_mark(Node, TileLoopMarker); 213 Node = isl_schedule_node_child(Node, 0); 214 Node = isl_schedule_node_band_tile(Node, Sizes); 215 Node = isl_schedule_node_child(Node, 0); 216 auto PointLoopMarkerStr = IdentifierString + " - Points"; 217 isl_id *PointLoopMarker = 218 isl_id_alloc(Ctx, PointLoopMarkerStr.c_str(), nullptr); 219 Node = isl_schedule_node_insert_mark(Node, PointLoopMarker); 220 Node = isl_schedule_node_child(Node, 0); 221 return Node; 222 } 223 224 bool ScheduleTreeOptimizer::isTileableBandNode( 225 __isl_keep isl_schedule_node *Node) { 226 if (isl_schedule_node_get_type(Node) != isl_schedule_node_band) 227 return false; 228 229 if (isl_schedule_node_n_children(Node) != 1) 230 return false; 231 232 if (!isl_schedule_node_band_get_permutable(Node)) 233 return false; 234 235 auto Space = isl_schedule_node_band_get_space(Node); 236 auto Dims = isl_space_dim(Space, isl_dim_set); 237 isl_space_free(Space); 238 239 if (Dims <= 1) 240 return false; 241 242 auto Child = isl_schedule_node_get_child(Node, 0); 243 auto Type = isl_schedule_node_get_type(Child); 244 isl_schedule_node_free(Child); 245 246 if (Type != isl_schedule_node_leaf) 247 return false; 248 249 return true; 250 } 251 252 __isl_give isl_schedule_node * 253 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node, 254 void *User) { 255 if (!isTileableBandNode(Node)) 256 return Node; 257 258 if (FirstLevelTiling) 259 Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes, 260 FirstLevelDefaultTileSize); 261 262 if (SecondLevelTiling) 263 Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes, 264 SecondLevelDefaultTileSize); 265 266 if (RegisterTiling) { 267 auto *Ctx = isl_schedule_node_get_ctx(Node); 268 Node = tileNode(Node, "Register tiling", RegisterTileSizes, 269 RegisterDefaultTileSize); 270 Node = isl_schedule_node_band_set_ast_build_options( 271 Node, isl_union_set_read_from_str(Ctx, "{unroll[x]}")); 272 } 273 274 if (PollyVectorizerChoice == VECTORIZER_NONE) 275 return Node; 276 277 auto Space = isl_schedule_node_band_get_space(Node); 278 auto Dims = isl_space_dim(Space, isl_dim_set); 279 isl_space_free(Space); 280 281 for (int i = Dims - 1; i >= 0; i--) 282 if (isl_schedule_node_band_member_get_coincident(Node, i)) { 283 Node = prevectSchedBand(Node, i, PrevectorWidth); 284 break; 285 } 286 287 return Node; 288 } 289 290 __isl_give isl_schedule * 291 ScheduleTreeOptimizer::optimizeSchedule(__isl_take isl_schedule *Schedule) { 292 isl_schedule_node *Root = isl_schedule_get_root(Schedule); 293 Root = optimizeScheduleNode(Root); 294 isl_schedule_free(Schedule); 295 auto S = isl_schedule_node_get_schedule(Root); 296 isl_schedule_node_free(Root); 297 return S; 298 } 299 300 __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeScheduleNode( 301 __isl_take isl_schedule_node *Node) { 302 Node = isl_schedule_node_map_descendant_bottom_up(Node, optimizeBand, NULL); 303 return Node; 304 } 305 306 bool ScheduleTreeOptimizer::isProfitableSchedule( 307 Scop &S, __isl_keep isl_union_map *NewSchedule) { 308 // To understand if the schedule has been optimized we check if the schedule 309 // has changed at all. 310 // TODO: We can improve this by tracking if any necessarily beneficial 311 // transformations have been performed. This can e.g. be tiling, loop 312 // interchange, or ...) We can track this either at the place where the 313 // transformation has been performed or, in case of automatic ILP based 314 // optimizations, by comparing (yet to be defined) performance metrics 315 // before/after the scheduling optimizer 316 // (e.g., #stride-one accesses) 317 isl_union_map *OldSchedule = S.getSchedule(); 318 bool changed = !isl_union_map_is_equal(OldSchedule, NewSchedule); 319 isl_union_map_free(OldSchedule); 320 return changed; 321 } 322 323 namespace { 324 class IslScheduleOptimizer : public ScopPass { 325 public: 326 static char ID; 327 explicit IslScheduleOptimizer() : ScopPass(ID) { LastSchedule = nullptr; } 328 329 ~IslScheduleOptimizer() { isl_schedule_free(LastSchedule); } 330 331 /// @brief Optimize the schedule of the SCoP @p S. 332 bool runOnScop(Scop &S) override; 333 334 /// @brief Print the new schedule for the SCoP @p S. 335 void printScop(raw_ostream &OS, Scop &S) const override; 336 337 /// @brief Register all analyses and transformation required. 338 void getAnalysisUsage(AnalysisUsage &AU) const override; 339 340 /// @brief Release the internal memory. 341 void releaseMemory() override { 342 isl_schedule_free(LastSchedule); 343 LastSchedule = nullptr; 344 } 345 346 private: 347 isl_schedule *LastSchedule; 348 }; 349 } 350 351 char IslScheduleOptimizer::ID = 0; 352 353 bool IslScheduleOptimizer::runOnScop(Scop &S) { 354 355 // Skip empty SCoPs but still allow code generation as it will delete the 356 // loops present but not needed. 357 if (S.getSize() == 0) { 358 S.markAsOptimized(); 359 return false; 360 } 361 362 const Dependences &D = getAnalysis<DependenceInfo>().getDependences(); 363 364 if (!D.hasValidDependences()) 365 return false; 366 367 isl_schedule_free(LastSchedule); 368 LastSchedule = nullptr; 369 370 // Build input data. 371 int ValidityKinds = 372 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 373 int ProximityKinds; 374 375 if (OptimizeDeps == "all") 376 ProximityKinds = 377 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 378 else if (OptimizeDeps == "raw") 379 ProximityKinds = Dependences::TYPE_RAW; 380 else { 381 errs() << "Do not know how to optimize for '" << OptimizeDeps << "'" 382 << " Falling back to optimizing all dependences.\n"; 383 ProximityKinds = 384 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 385 } 386 387 isl_union_set *Domain = S.getDomains(); 388 389 if (!Domain) 390 return false; 391 392 isl_union_map *Validity = D.getDependences(ValidityKinds); 393 isl_union_map *Proximity = D.getDependences(ProximityKinds); 394 395 // Simplify the dependences by removing the constraints introduced by the 396 // domains. This can speed up the scheduling time significantly, as large 397 // constant coefficients will be removed from the dependences. The 398 // introduction of some additional dependences reduces the possible 399 // transformations, but in most cases, such transformation do not seem to be 400 // interesting anyway. In some cases this option may stop the scheduler to 401 // find any schedule. 402 if (SimplifyDeps == "yes") { 403 Validity = isl_union_map_gist_domain(Validity, isl_union_set_copy(Domain)); 404 Validity = isl_union_map_gist_range(Validity, isl_union_set_copy(Domain)); 405 Proximity = 406 isl_union_map_gist_domain(Proximity, isl_union_set_copy(Domain)); 407 Proximity = isl_union_map_gist_range(Proximity, isl_union_set_copy(Domain)); 408 } else if (SimplifyDeps != "no") { 409 errs() << "warning: Option -polly-opt-simplify-deps should either be 'yes' " 410 "or 'no'. Falling back to default: 'yes'\n"; 411 } 412 413 DEBUG(dbgs() << "\n\nCompute schedule from: "); 414 DEBUG(dbgs() << "Domain := " << stringFromIslObj(Domain) << ";\n"); 415 DEBUG(dbgs() << "Proximity := " << stringFromIslObj(Proximity) << ";\n"); 416 DEBUG(dbgs() << "Validity := " << stringFromIslObj(Validity) << ";\n"); 417 418 unsigned IslSerializeSCCs; 419 420 if (FusionStrategy == "max") { 421 IslSerializeSCCs = 0; 422 } else if (FusionStrategy == "min") { 423 IslSerializeSCCs = 1; 424 } else { 425 errs() << "warning: Unknown fusion strategy. Falling back to maximal " 426 "fusion.\n"; 427 IslSerializeSCCs = 0; 428 } 429 430 int IslMaximizeBands; 431 432 if (MaximizeBandDepth == "yes") { 433 IslMaximizeBands = 1; 434 } else if (MaximizeBandDepth == "no") { 435 IslMaximizeBands = 0; 436 } else { 437 errs() << "warning: Option -polly-opt-maximize-bands should either be 'yes'" 438 " or 'no'. Falling back to default: 'yes'\n"; 439 IslMaximizeBands = 1; 440 } 441 442 isl_options_set_schedule_serialize_sccs(S.getIslCtx(), IslSerializeSCCs); 443 isl_options_set_schedule_maximize_band_depth(S.getIslCtx(), IslMaximizeBands); 444 isl_options_set_schedule_max_constant_term(S.getIslCtx(), MaxConstantTerm); 445 isl_options_set_schedule_max_coefficient(S.getIslCtx(), MaxCoefficient); 446 isl_options_set_tile_scale_tile_loops(S.getIslCtx(), 0); 447 448 isl_options_set_on_error(S.getIslCtx(), ISL_ON_ERROR_CONTINUE); 449 450 isl_schedule_constraints *ScheduleConstraints; 451 ScheduleConstraints = isl_schedule_constraints_on_domain(Domain); 452 ScheduleConstraints = 453 isl_schedule_constraints_set_proximity(ScheduleConstraints, Proximity); 454 ScheduleConstraints = isl_schedule_constraints_set_validity( 455 ScheduleConstraints, isl_union_map_copy(Validity)); 456 ScheduleConstraints = 457 isl_schedule_constraints_set_coincidence(ScheduleConstraints, Validity); 458 isl_schedule *Schedule; 459 Schedule = isl_schedule_constraints_compute_schedule(ScheduleConstraints); 460 isl_options_set_on_error(S.getIslCtx(), ISL_ON_ERROR_ABORT); 461 462 // In cases the scheduler is not able to optimize the code, we just do not 463 // touch the schedule. 464 if (!Schedule) 465 return false; 466 467 DEBUG({ 468 auto *P = isl_printer_to_str(S.getIslCtx()); 469 P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); 470 P = isl_printer_print_schedule(P, Schedule); 471 dbgs() << "NewScheduleTree: \n" << isl_printer_get_str(P) << "\n"; 472 isl_printer_free(P); 473 }); 474 475 isl_schedule *NewSchedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule); 476 isl_union_map *NewScheduleMap = isl_schedule_get_map(NewSchedule); 477 478 if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewScheduleMap)) { 479 isl_union_map_free(NewScheduleMap); 480 isl_schedule_free(NewSchedule); 481 return false; 482 } 483 484 S.setScheduleTree(NewSchedule); 485 S.markAsOptimized(); 486 487 isl_union_map_free(NewScheduleMap); 488 return false; 489 } 490 491 void IslScheduleOptimizer::printScop(raw_ostream &OS, Scop &) const { 492 isl_printer *p; 493 char *ScheduleStr; 494 495 OS << "Calculated schedule:\n"; 496 497 if (!LastSchedule) { 498 OS << "n/a\n"; 499 return; 500 } 501 502 p = isl_printer_to_str(isl_schedule_get_ctx(LastSchedule)); 503 p = isl_printer_print_schedule(p, LastSchedule); 504 ScheduleStr = isl_printer_get_str(p); 505 isl_printer_free(p); 506 507 OS << ScheduleStr << "\n"; 508 } 509 510 void IslScheduleOptimizer::getAnalysisUsage(AnalysisUsage &AU) const { 511 ScopPass::getAnalysisUsage(AU); 512 AU.addRequired<DependenceInfo>(); 513 } 514 515 Pass *polly::createIslScheduleOptimizerPass() { 516 return new IslScheduleOptimizer(); 517 } 518 519 INITIALIZE_PASS_BEGIN(IslScheduleOptimizer, "polly-opt-isl", 520 "Polly - Optimize schedule of SCoP", false, false); 521 INITIALIZE_PASS_DEPENDENCY(DependenceInfo); 522 INITIALIZE_PASS_DEPENDENCY(ScopInfo); 523 INITIALIZE_PASS_END(IslScheduleOptimizer, "polly-opt-isl", 524 "Polly - Optimize schedule of SCoP", false, false) 525