1 //===- Schedule.cpp - Calculate an optimized schedule ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This pass the isl to calculate a schedule that is optimized for parallelism 11 // and tileablility. The algorithm used in isl is an optimized version of the 12 // algorithm described in following paper: 13 // 14 // U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan. 15 // A Practical Automatic Polyhedral Parallelizer and Locality Optimizer. 16 // In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language 17 // Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008. 18 //===----------------------------------------------------------------------===// 19 20 #include "polly/ScheduleOptimizer.h" 21 #include "polly/CodeGen/CodeGeneration.h" 22 #include "polly/DependenceInfo.h" 23 #include "polly/LinkAllPasses.h" 24 #include "polly/Options.h" 25 #include "polly/ScopInfo.h" 26 #include "polly/Support/GICHelper.h" 27 #include "llvm/Support/Debug.h" 28 #include "isl/aff.h" 29 #include "isl/band.h" 30 #include "isl/constraint.h" 31 #include "isl/map.h" 32 #include "isl/options.h" 33 #include "isl/printer.h" 34 #include "isl/schedule.h" 35 #include "isl/schedule_node.h" 36 #include "isl/space.h" 37 #include "isl/union_map.h" 38 #include "isl/union_set.h" 39 40 using namespace llvm; 41 using namespace polly; 42 43 #define DEBUG_TYPE "polly-opt-isl" 44 45 namespace polly { 46 bool DisablePollyTiling; 47 } 48 static cl::opt<bool, true> 49 DisableTiling("polly-no-tiling", 50 cl::desc("Disable tiling in the scheduler"), 51 cl::location(polly::DisablePollyTiling), cl::init(false), 52 cl::ZeroOrMore, cl::cat(PollyCategory)); 53 54 static cl::opt<std::string> 55 OptimizeDeps("polly-opt-optimize-only", 56 cl::desc("Only a certain kind of dependences (all/raw)"), 57 cl::Hidden, cl::init("all"), cl::ZeroOrMore, 58 cl::cat(PollyCategory)); 59 60 static cl::opt<std::string> 61 SimplifyDeps("polly-opt-simplify-deps", 62 cl::desc("Dependences should be simplified (yes/no)"), 63 cl::Hidden, cl::init("yes"), cl::ZeroOrMore, 64 cl::cat(PollyCategory)); 65 66 static cl::opt<int> MaxConstantTerm( 67 "polly-opt-max-constant-term", 68 cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden, 69 cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory)); 70 71 static cl::opt<int> MaxCoefficient( 72 "polly-opt-max-coefficient", 73 cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden, 74 cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory)); 75 76 static cl::opt<std::string> FusionStrategy( 77 "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"), 78 cl::Hidden, cl::init("min"), cl::ZeroOrMore, cl::cat(PollyCategory)); 79 80 static cl::opt<std::string> 81 MaximizeBandDepth("polly-opt-maximize-bands", 82 cl::desc("Maximize the band depth (yes/no)"), cl::Hidden, 83 cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory)); 84 85 static cl::opt<int> DefaultTileSize( 86 "polly-default-tile-size", 87 cl::desc("The default tile size (if not enough were provided by" 88 " --polly-tile-sizes)"), 89 cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory)); 90 91 static cl::list<int> TileSizes("polly-tile-sizes", 92 cl::desc("A tile size" 93 " for each loop dimension, filled with" 94 " --polly-default-tile-size"), 95 cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, 96 cl::cat(PollyCategory)); 97 namespace { 98 99 class IslScheduleOptimizer : public ScopPass { 100 public: 101 static char ID; 102 explicit IslScheduleOptimizer() : ScopPass(ID) { LastSchedule = nullptr; } 103 104 ~IslScheduleOptimizer() { isl_schedule_free(LastSchedule); } 105 106 bool runOnScop(Scop &S) override; 107 void printScop(raw_ostream &OS, Scop &S) const override; 108 void getAnalysisUsage(AnalysisUsage &AU) const override; 109 110 private: 111 isl_schedule *LastSchedule; 112 113 /// @brief Decide if the @p NewSchedule is profitable for @p S. 114 /// 115 /// @param S The SCoP we optimize. 116 /// @param NewSchedule The new schedule we computed. 117 /// 118 /// @return True, if we believe @p NewSchedule is an improvement for @p S. 119 bool isProfitableSchedule(Scop &S, __isl_keep isl_union_map *NewSchedule); 120 121 /// @brief Create a map that pre-vectorizes one scheduling dimension. 122 /// 123 /// getPrevectorMap creates a map that maps each input dimension to the same 124 /// output dimension, except for the dimension DimToVectorize. 125 /// DimToVectorize is strip mined by 'VectorWidth' and the newly created 126 /// point loop of DimToVectorize is moved to the innermost level. 127 /// 128 /// Example (DimToVectorize=0, ScheduleDimensions=2, VectorWidth=4): 129 /// 130 /// | Before transformation 131 /// | 132 /// | A[i,j] -> [i,j] 133 /// | 134 /// | for (i = 0; i < 128; i++) 135 /// | for (j = 0; j < 128; j++) 136 /// | A(i,j); 137 /// 138 /// Prevector map: 139 /// [i,j] -> [it,j,ip] : it % 4 = 0 and it <= ip <= it + 3 and i = ip 140 /// 141 /// | After transformation: 142 /// | 143 /// | A[i,j] -> [it,j,ip] : it % 4 = 0 and it <= ip <= it + 3 and i = ip 144 /// | 145 /// | for (it = 0; it < 128; it+=4) 146 /// | for (j = 0; j < 128; j++) 147 /// | for (ip = max(0,it); ip < min(128, it + 3); ip++) 148 /// | A(ip,j); 149 /// 150 /// The goal of this transformation is to create a trivially vectorizable 151 /// loop. This means a parallel loop at the innermost level that has a 152 /// constant number of iterations corresponding to the target vector width. 153 /// 154 /// This transformation creates a loop at the innermost level. The loop has 155 /// a constant number of iterations, if the number of loop iterations at 156 /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is 157 /// currently constant and not yet target specific. This function does not 158 /// reason about parallelism. 159 static __isl_give isl_map *getPrevectorMap(isl_ctx *ctx, int DimToVectorize, 160 int ScheduleDimensions, 161 int VectorWidth = 4); 162 163 /// @brief Apply additional optimizations on the bands in the schedule tree. 164 /// 165 /// We are looking for an innermost band node and apply the following 166 /// transformations: 167 /// 168 /// - Tile the band 169 /// - if the band is tileable 170 /// - if the band has more than one loop dimension 171 /// 172 /// - Prevectorize the point loop of the tile 173 /// - if vectorization is enabled 174 /// 175 /// @param Node The schedule node to (possibly) optimize. 176 /// @param User A pointer to forward some use information (currently unused). 177 static isl_schedule_node *optimizeBand(isl_schedule_node *Node, void *User); 178 179 static __isl_give isl_union_map * 180 getScheduleMap(__isl_keep isl_schedule *Schedule); 181 182 using llvm::Pass::doFinalization; 183 184 virtual bool doFinalization() override { 185 isl_schedule_free(LastSchedule); 186 LastSchedule = nullptr; 187 return true; 188 } 189 }; 190 } 191 192 char IslScheduleOptimizer::ID = 0; 193 194 __isl_give isl_map * 195 IslScheduleOptimizer::getPrevectorMap(isl_ctx *ctx, int DimToVectorize, 196 int ScheduleDimensions, int VectorWidth) { 197 isl_space *Space; 198 isl_local_space *LocalSpace, *LocalSpaceRange; 199 isl_set *Modulo; 200 isl_map *TilingMap; 201 isl_constraint *c; 202 isl_aff *Aff; 203 int PointDimension; /* ip */ 204 int TileDimension; /* it */ 205 isl_val *VectorWidthMP; 206 207 assert(0 <= DimToVectorize && DimToVectorize < ScheduleDimensions); 208 209 Space = isl_space_alloc(ctx, 0, ScheduleDimensions, ScheduleDimensions + 1); 210 TilingMap = isl_map_universe(isl_space_copy(Space)); 211 LocalSpace = isl_local_space_from_space(Space); 212 PointDimension = ScheduleDimensions; 213 TileDimension = DimToVectorize; 214 215 // Create an identity map for everything except DimToVectorize and map 216 // DimToVectorize to the point loop at the innermost dimension. 217 for (int i = 0; i < ScheduleDimensions; i++) 218 if (i == DimToVectorize) 219 TilingMap = 220 isl_map_equate(TilingMap, isl_dim_in, i, isl_dim_out, PointDimension); 221 else 222 TilingMap = isl_map_equate(TilingMap, isl_dim_in, i, isl_dim_out, i); 223 224 // it % 'VectorWidth' = 0 225 LocalSpaceRange = isl_local_space_range(isl_local_space_copy(LocalSpace)); 226 Aff = isl_aff_zero_on_domain(LocalSpaceRange); 227 Aff = isl_aff_set_constant_si(Aff, VectorWidth); 228 Aff = isl_aff_set_coefficient_si(Aff, isl_dim_in, TileDimension, 1); 229 VectorWidthMP = isl_val_int_from_si(ctx, VectorWidth); 230 Aff = isl_aff_mod_val(Aff, VectorWidthMP); 231 Modulo = isl_pw_aff_zero_set(isl_pw_aff_from_aff(Aff)); 232 TilingMap = isl_map_intersect_range(TilingMap, Modulo); 233 234 // it <= ip 235 TilingMap = isl_map_order_le(TilingMap, isl_dim_out, TileDimension, 236 isl_dim_out, PointDimension); 237 238 // ip <= it + ('VectorWidth' - 1) 239 c = isl_inequality_alloc(LocalSpace); 240 isl_constraint_set_coefficient_si(c, isl_dim_out, TileDimension, 1); 241 isl_constraint_set_coefficient_si(c, isl_dim_out, PointDimension, -1); 242 isl_constraint_set_constant_si(c, VectorWidth - 1); 243 TilingMap = isl_map_add_constraint(TilingMap, c); 244 245 return TilingMap; 246 } 247 248 isl_schedule_node *IslScheduleOptimizer::optimizeBand(isl_schedule_node *Node, 249 void *User) { 250 if (isl_schedule_node_get_type(Node) != isl_schedule_node_band) 251 return Node; 252 253 if (isl_schedule_node_n_children(Node) != 1) 254 return Node; 255 256 if (!isl_schedule_node_band_get_permutable(Node)) 257 return Node; 258 259 auto Space = isl_schedule_node_band_get_space(Node); 260 auto Dims = isl_space_dim(Space, isl_dim_set); 261 262 if (Dims <= 1) { 263 isl_space_free(Space); 264 return Node; 265 } 266 267 auto Child = isl_schedule_node_get_child(Node, 0); 268 auto Type = isl_schedule_node_get_type(Child); 269 isl_schedule_node_free(Child); 270 271 if (Type != isl_schedule_node_leaf) { 272 isl_space_free(Space); 273 return Node; 274 } 275 276 auto Sizes = isl_multi_val_zero(Space); 277 auto Ctx = isl_schedule_node_get_ctx(Node); 278 279 for (unsigned i = 0; i < Dims; i++) { 280 auto tileSize = TileSizes.size() > i ? TileSizes[i] : DefaultTileSize; 281 Sizes = isl_multi_val_set_val(Sizes, i, isl_val_int_from_si(Ctx, tileSize)); 282 } 283 284 isl_schedule_node *Res; 285 286 if (DisableTiling) { 287 isl_multi_val_free(Sizes); 288 Res = Node; 289 } else { 290 Res = isl_schedule_node_band_tile(Node, Sizes); 291 } 292 293 if (PollyVectorizerChoice == VECTORIZER_NONE) 294 return Res; 295 296 Child = isl_schedule_node_get_child(Res, 0); 297 auto ChildSchedule = isl_schedule_node_band_get_partial_schedule(Child); 298 299 for (int i = Dims - 1; i >= 0; i--) { 300 if (isl_schedule_node_band_member_get_coincident(Child, i)) { 301 auto TileMap = IslScheduleOptimizer::getPrevectorMap(Ctx, i, Dims); 302 auto TileUMap = isl_union_map_from_map(TileMap); 303 auto ChildSchedule2 = isl_union_map_apply_range( 304 isl_union_map_from_multi_union_pw_aff(ChildSchedule), TileUMap); 305 ChildSchedule = isl_multi_union_pw_aff_from_union_map(ChildSchedule2); 306 break; 307 } 308 } 309 310 isl_schedule_node_free(Res); 311 Res = isl_schedule_node_delete(Child); 312 Res = isl_schedule_node_insert_partial_schedule(Res, ChildSchedule); 313 return Res; 314 } 315 316 __isl_give isl_union_map * 317 IslScheduleOptimizer::getScheduleMap(__isl_keep isl_schedule *Schedule) { 318 isl_schedule_node *Root = isl_schedule_get_root(Schedule); 319 Root = isl_schedule_node_map_descendant_bottom_up( 320 Root, IslScheduleOptimizer::optimizeBand, NULL); 321 auto ScheduleMap = isl_schedule_node_get_subtree_schedule_union_map(Root); 322 ScheduleMap = isl_union_map_detect_equalities(ScheduleMap); 323 isl_schedule_node_free(Root); 324 return ScheduleMap; 325 } 326 327 bool IslScheduleOptimizer::isProfitableSchedule( 328 Scop &S, __isl_keep isl_union_map *NewSchedule) { 329 // To understand if the schedule has been optimized we check if the schedule 330 // has changed at all. 331 // TODO: We can improve this by tracking if any necessarily beneficial 332 // transformations have been performed. This can e.g. be tiling, loop 333 // interchange, or ...) We can track this either at the place where the 334 // transformation has been performed or, in case of automatic ILP based 335 // optimizations, by comparing (yet to be defined) performance metrics 336 // before/after the scheduling optimizer 337 // (e.g., #stride-one accesses) 338 isl_union_map *OldSchedule = S.getSchedule(); 339 bool changed = !isl_union_map_is_equal(OldSchedule, NewSchedule); 340 isl_union_map_free(OldSchedule); 341 return changed; 342 } 343 344 bool IslScheduleOptimizer::runOnScop(Scop &S) { 345 346 // Skip empty SCoPs but still allow code generation as it will delete the 347 // loops present but not needed. 348 if (S.getSize() == 0) { 349 S.markAsOptimized(); 350 return false; 351 } 352 353 const Dependences &D = getAnalysis<DependenceInfo>().getDependences(); 354 355 if (!D.hasValidDependences()) 356 return false; 357 358 isl_schedule_free(LastSchedule); 359 LastSchedule = nullptr; 360 361 // Build input data. 362 int ValidityKinds = 363 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 364 int ProximityKinds; 365 366 if (OptimizeDeps == "all") 367 ProximityKinds = 368 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 369 else if (OptimizeDeps == "raw") 370 ProximityKinds = Dependences::TYPE_RAW; 371 else { 372 errs() << "Do not know how to optimize for '" << OptimizeDeps << "'" 373 << " Falling back to optimizing all dependences.\n"; 374 ProximityKinds = 375 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 376 } 377 378 isl_union_set *Domain = S.getDomains(); 379 380 if (!Domain) 381 return false; 382 383 isl_union_map *Validity = D.getDependences(ValidityKinds); 384 isl_union_map *Proximity = D.getDependences(ProximityKinds); 385 386 // Simplify the dependences by removing the constraints introduced by the 387 // domains. This can speed up the scheduling time significantly, as large 388 // constant coefficients will be removed from the dependences. The 389 // introduction of some additional dependences reduces the possible 390 // transformations, but in most cases, such transformation do not seem to be 391 // interesting anyway. In some cases this option may stop the scheduler to 392 // find any schedule. 393 if (SimplifyDeps == "yes") { 394 Validity = isl_union_map_gist_domain(Validity, isl_union_set_copy(Domain)); 395 Validity = isl_union_map_gist_range(Validity, isl_union_set_copy(Domain)); 396 Proximity = 397 isl_union_map_gist_domain(Proximity, isl_union_set_copy(Domain)); 398 Proximity = isl_union_map_gist_range(Proximity, isl_union_set_copy(Domain)); 399 } else if (SimplifyDeps != "no") { 400 errs() << "warning: Option -polly-opt-simplify-deps should either be 'yes' " 401 "or 'no'. Falling back to default: 'yes'\n"; 402 } 403 404 DEBUG(dbgs() << "\n\nCompute schedule from: "); 405 DEBUG(dbgs() << "Domain := " << stringFromIslObj(Domain) << ";\n"); 406 DEBUG(dbgs() << "Proximity := " << stringFromIslObj(Proximity) << ";\n"); 407 DEBUG(dbgs() << "Validity := " << stringFromIslObj(Validity) << ";\n"); 408 409 unsigned IslSerializeSCCs; 410 411 if (FusionStrategy == "max") { 412 IslSerializeSCCs = 0; 413 } else if (FusionStrategy == "min") { 414 IslSerializeSCCs = 1; 415 } else { 416 errs() << "warning: Unknown fusion strategy. Falling back to maximal " 417 "fusion.\n"; 418 IslSerializeSCCs = 0; 419 } 420 421 int IslMaximizeBands; 422 423 if (MaximizeBandDepth == "yes") { 424 IslMaximizeBands = 1; 425 } else if (MaximizeBandDepth == "no") { 426 IslMaximizeBands = 0; 427 } else { 428 errs() << "warning: Option -polly-opt-maximize-bands should either be 'yes'" 429 " or 'no'. Falling back to default: 'yes'\n"; 430 IslMaximizeBands = 1; 431 } 432 433 isl_options_set_schedule_serialize_sccs(S.getIslCtx(), IslSerializeSCCs); 434 isl_options_set_schedule_maximize_band_depth(S.getIslCtx(), IslMaximizeBands); 435 isl_options_set_schedule_max_constant_term(S.getIslCtx(), MaxConstantTerm); 436 isl_options_set_schedule_max_coefficient(S.getIslCtx(), MaxCoefficient); 437 isl_options_set_tile_scale_tile_loops(S.getIslCtx(), 0); 438 439 isl_options_set_on_error(S.getIslCtx(), ISL_ON_ERROR_CONTINUE); 440 441 isl_schedule_constraints *ScheduleConstraints; 442 ScheduleConstraints = isl_schedule_constraints_on_domain(Domain); 443 ScheduleConstraints = 444 isl_schedule_constraints_set_proximity(ScheduleConstraints, Proximity); 445 ScheduleConstraints = isl_schedule_constraints_set_validity( 446 ScheduleConstraints, isl_union_map_copy(Validity)); 447 ScheduleConstraints = 448 isl_schedule_constraints_set_coincidence(ScheduleConstraints, Validity); 449 isl_schedule *Schedule; 450 Schedule = isl_schedule_constraints_compute_schedule(ScheduleConstraints); 451 isl_options_set_on_error(S.getIslCtx(), ISL_ON_ERROR_ABORT); 452 453 // In cases the scheduler is not able to optimize the code, we just do not 454 // touch the schedule. 455 if (!Schedule) 456 return false; 457 458 DEBUG({ 459 auto *P = isl_printer_to_str(S.getIslCtx()); 460 P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); 461 P = isl_printer_print_schedule(P, Schedule); 462 dbgs() << "NewScheduleTree: \n" << isl_printer_get_str(P) << "\n"; 463 isl_printer_free(P); 464 }); 465 466 isl_union_map *NewSchedule = getScheduleMap(Schedule); 467 468 // Check if the optimizations performed were profitable, otherwise exit early. 469 if (!isProfitableSchedule(S, NewSchedule)) { 470 isl_schedule_free(Schedule); 471 isl_union_map_free(NewSchedule); 472 return false; 473 } 474 475 S.markAsOptimized(); 476 477 for (ScopStmt &Stmt : S) { 478 isl_map *StmtSchedule; 479 isl_set *Domain = Stmt.getDomain(); 480 isl_union_map *StmtBand; 481 StmtBand = isl_union_map_intersect_domain(isl_union_map_copy(NewSchedule), 482 isl_union_set_from_set(Domain)); 483 if (isl_union_map_is_empty(StmtBand)) { 484 StmtSchedule = isl_map_from_domain(isl_set_empty(Stmt.getDomainSpace())); 485 isl_union_map_free(StmtBand); 486 } else { 487 assert(isl_union_map_n_map(StmtBand) == 1); 488 StmtSchedule = isl_map_from_union_map(StmtBand); 489 } 490 491 Stmt.setSchedule(StmtSchedule); 492 } 493 494 isl_schedule_free(Schedule); 495 isl_union_map_free(NewSchedule); 496 return false; 497 } 498 499 void IslScheduleOptimizer::printScop(raw_ostream &OS, Scop &) const { 500 isl_printer *p; 501 char *ScheduleStr; 502 503 OS << "Calculated schedule:\n"; 504 505 if (!LastSchedule) { 506 OS << "n/a\n"; 507 return; 508 } 509 510 p = isl_printer_to_str(isl_schedule_get_ctx(LastSchedule)); 511 p = isl_printer_print_schedule(p, LastSchedule); 512 ScheduleStr = isl_printer_get_str(p); 513 isl_printer_free(p); 514 515 OS << ScheduleStr << "\n"; 516 } 517 518 void IslScheduleOptimizer::getAnalysisUsage(AnalysisUsage &AU) const { 519 ScopPass::getAnalysisUsage(AU); 520 AU.addRequired<DependenceInfo>(); 521 } 522 523 Pass *polly::createIslScheduleOptimizerPass() { 524 return new IslScheduleOptimizer(); 525 } 526 527 INITIALIZE_PASS_BEGIN(IslScheduleOptimizer, "polly-opt-isl", 528 "Polly - Optimize schedule of SCoP", false, false); 529 INITIALIZE_PASS_DEPENDENCY(DependenceInfo); 530 INITIALIZE_PASS_DEPENDENCY(ScopInfo); 531 INITIALIZE_PASS_END(IslScheduleOptimizer, "polly-opt-isl", 532 "Polly - Optimize schedule of SCoP", false, false) 533