1 //===- Schedule.cpp - Calculate an optimized schedule ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This pass the isl to calculate a schedule that is optimized for parallelism 11 // and tileablility. The algorithm used in isl is an optimized version of the 12 // algorithm described in following paper: 13 // 14 // U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan. 15 // A Practical Automatic Polyhedral Parallelizer and Locality Optimizer. 16 // In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language 17 // Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008. 18 //===----------------------------------------------------------------------===// 19 20 #include "polly/ScheduleOptimizer.h" 21 #include "isl/aff.h" 22 #include "isl/band.h" 23 #include "isl/constraint.h" 24 #include "isl/map.h" 25 #include "isl/options.h" 26 #include "isl/schedule.h" 27 #include "isl/schedule_node.h" 28 #include "isl/space.h" 29 #include "polly/CodeGen/CodeGeneration.h" 30 #include "polly/DependenceInfo.h" 31 #include "polly/LinkAllPasses.h" 32 #include "polly/Options.h" 33 #include "polly/ScopInfo.h" 34 #include "polly/Support/GICHelper.h" 35 #include "llvm/Support/Debug.h" 36 37 using namespace llvm; 38 using namespace polly; 39 40 #define DEBUG_TYPE "polly-opt-isl" 41 42 namespace polly { 43 bool DisablePollyTiling; 44 } 45 static cl::opt<bool, true> 46 DisableTiling("polly-no-tiling", 47 cl::desc("Disable tiling in the scheduler"), 48 cl::location(polly::DisablePollyTiling), cl::init(false), 49 cl::ZeroOrMore, cl::cat(PollyCategory)); 50 51 static cl::opt<std::string> 52 OptimizeDeps("polly-opt-optimize-only", 53 cl::desc("Only a certain kind of dependences (all/raw)"), 54 cl::Hidden, cl::init("all"), cl::ZeroOrMore, 55 cl::cat(PollyCategory)); 56 57 static cl::opt<std::string> 58 SimplifyDeps("polly-opt-simplify-deps", 59 cl::desc("Dependences should be simplified (yes/no)"), 60 cl::Hidden, cl::init("yes"), cl::ZeroOrMore, 61 cl::cat(PollyCategory)); 62 63 static cl::opt<int> MaxConstantTerm( 64 "polly-opt-max-constant-term", 65 cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden, 66 cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory)); 67 68 static cl::opt<int> MaxCoefficient( 69 "polly-opt-max-coefficient", 70 cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden, 71 cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory)); 72 73 static cl::opt<std::string> FusionStrategy( 74 "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"), 75 cl::Hidden, cl::init("min"), cl::ZeroOrMore, cl::cat(PollyCategory)); 76 77 static cl::opt<std::string> 78 MaximizeBandDepth("polly-opt-maximize-bands", 79 cl::desc("Maximize the band depth (yes/no)"), cl::Hidden, 80 cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory)); 81 82 static cl::opt<int> DefaultTileSize( 83 "polly-default-tile-size", 84 cl::desc("The default tile size (if not enough were provided by" 85 " --polly-tile-sizes)"), 86 cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory)); 87 88 static cl::list<int> TileSizes("polly-tile-sizes", 89 cl::desc("A tile size" 90 " for each loop dimension, filled with" 91 " --polly-default-tile-size"), 92 cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, 93 cl::cat(PollyCategory)); 94 namespace { 95 96 class IslScheduleOptimizer : public ScopPass { 97 public: 98 static char ID; 99 explicit IslScheduleOptimizer() : ScopPass(ID) { LastSchedule = nullptr; } 100 101 ~IslScheduleOptimizer() { isl_schedule_free(LastSchedule); } 102 103 bool runOnScop(Scop &S) override; 104 void printScop(raw_ostream &OS, Scop &S) const override; 105 void getAnalysisUsage(AnalysisUsage &AU) const override; 106 107 private: 108 isl_schedule *LastSchedule; 109 110 /// @brief Decide if the @p NewSchedule is profitable for @p S. 111 /// 112 /// @param S The SCoP we optimize. 113 /// @param NewSchedule The new schedule we computed. 114 /// 115 /// @return True, if we believe @p NewSchedule is an improvement for @p S. 116 bool isProfitableSchedule(Scop &S, __isl_keep isl_union_map *NewSchedule); 117 118 /// @brief Create a map that pre-vectorizes one scheduling dimension. 119 /// 120 /// getPrevectorMap creates a map that maps each input dimension to the same 121 /// output dimension, except for the dimension DimToVectorize. 122 /// DimToVectorize is strip mined by 'VectorWidth' and the newly created 123 /// point loop of DimToVectorize is moved to the innermost level. 124 /// 125 /// Example (DimToVectorize=0, ScheduleDimensions=2, VectorWidth=4): 126 /// 127 /// | Before transformation 128 /// | 129 /// | A[i,j] -> [i,j] 130 /// | 131 /// | for (i = 0; i < 128; i++) 132 /// | for (j = 0; j < 128; j++) 133 /// | A(i,j); 134 /// 135 /// Prevector map: 136 /// [i,j] -> [it,j,ip] : it % 4 = 0 and it <= ip <= it + 3 and i = ip 137 /// 138 /// | After transformation: 139 /// | 140 /// | A[i,j] -> [it,j,ip] : it % 4 = 0 and it <= ip <= it + 3 and i = ip 141 /// | 142 /// | for (it = 0; it < 128; it+=4) 143 /// | for (j = 0; j < 128; j++) 144 /// | for (ip = max(0,it); ip < min(128, it + 3); ip++) 145 /// | A(ip,j); 146 /// 147 /// The goal of this transformation is to create a trivially vectorizable 148 /// loop. This means a parallel loop at the innermost level that has a 149 /// constant number of iterations corresponding to the target vector width. 150 /// 151 /// This transformation creates a loop at the innermost level. The loop has 152 /// a constant number of iterations, if the number of loop iterations at 153 /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is 154 /// currently constant and not yet target specific. This function does not 155 /// reason about parallelism. 156 static __isl_give isl_map *getPrevectorMap(isl_ctx *ctx, int DimToVectorize, 157 int ScheduleDimensions, 158 int VectorWidth = 4); 159 160 /// @brief Apply additional optimizations on the bands in the schedule tree. 161 /// 162 /// We are looking for an innermost band node and apply the following 163 /// transformations: 164 /// 165 /// - Tile the band 166 /// - if the band is tileable 167 /// - if the band has more than one loop dimension 168 /// 169 /// - Prevectorize the point loop of the tile 170 /// - if vectorization is enabled 171 /// 172 /// @param Node The schedule node to (possibly) optimize. 173 /// @param User A pointer to forward some use information (currently unused). 174 static isl_schedule_node *optimizeBand(isl_schedule_node *Node, void *User); 175 176 static __isl_give isl_union_map * 177 getScheduleMap(__isl_keep isl_schedule *Schedule); 178 179 using llvm::Pass::doFinalization; 180 181 virtual bool doFinalization() override { 182 isl_schedule_free(LastSchedule); 183 LastSchedule = nullptr; 184 return true; 185 } 186 }; 187 } 188 189 char IslScheduleOptimizer::ID = 0; 190 191 __isl_give isl_map * 192 IslScheduleOptimizer::getPrevectorMap(isl_ctx *ctx, int DimToVectorize, 193 int ScheduleDimensions, int VectorWidth) { 194 isl_space *Space; 195 isl_local_space *LocalSpace, *LocalSpaceRange; 196 isl_set *Modulo; 197 isl_map *TilingMap; 198 isl_constraint *c; 199 isl_aff *Aff; 200 int PointDimension; /* ip */ 201 int TileDimension; /* it */ 202 isl_val *VectorWidthMP; 203 204 assert(0 <= DimToVectorize && DimToVectorize < ScheduleDimensions); 205 206 Space = isl_space_alloc(ctx, 0, ScheduleDimensions, ScheduleDimensions + 1); 207 TilingMap = isl_map_universe(isl_space_copy(Space)); 208 LocalSpace = isl_local_space_from_space(Space); 209 PointDimension = ScheduleDimensions; 210 TileDimension = DimToVectorize; 211 212 // Create an identity map for everything except DimToVectorize and map 213 // DimToVectorize to the point loop at the innermost dimension. 214 for (int i = 0; i < ScheduleDimensions; i++) { 215 c = isl_equality_alloc(isl_local_space_copy(LocalSpace)); 216 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, -1); 217 218 if (i == DimToVectorize) 219 c = isl_constraint_set_coefficient_si(c, isl_dim_out, PointDimension, 1); 220 else 221 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, 1); 222 223 TilingMap = isl_map_add_constraint(TilingMap, c); 224 } 225 226 // it % 'VectorWidth' = 0 227 LocalSpaceRange = isl_local_space_range(isl_local_space_copy(LocalSpace)); 228 Aff = isl_aff_zero_on_domain(LocalSpaceRange); 229 Aff = isl_aff_set_constant_si(Aff, VectorWidth); 230 Aff = isl_aff_set_coefficient_si(Aff, isl_dim_in, TileDimension, 1); 231 VectorWidthMP = isl_val_int_from_si(ctx, VectorWidth); 232 Aff = isl_aff_mod_val(Aff, VectorWidthMP); 233 Modulo = isl_pw_aff_zero_set(isl_pw_aff_from_aff(Aff)); 234 TilingMap = isl_map_intersect_range(TilingMap, Modulo); 235 236 // it <= ip 237 c = isl_inequality_alloc(isl_local_space_copy(LocalSpace)); 238 isl_constraint_set_coefficient_si(c, isl_dim_out, TileDimension, -1); 239 isl_constraint_set_coefficient_si(c, isl_dim_out, PointDimension, 1); 240 TilingMap = isl_map_add_constraint(TilingMap, c); 241 242 // ip <= it + ('VectorWidth' - 1) 243 c = isl_inequality_alloc(LocalSpace); 244 isl_constraint_set_coefficient_si(c, isl_dim_out, TileDimension, 1); 245 isl_constraint_set_coefficient_si(c, isl_dim_out, PointDimension, -1); 246 isl_constraint_set_constant_si(c, VectorWidth - 1); 247 TilingMap = isl_map_add_constraint(TilingMap, c); 248 249 return TilingMap; 250 } 251 252 isl_schedule_node *IslScheduleOptimizer::optimizeBand(isl_schedule_node *Node, 253 void *User) { 254 if (isl_schedule_node_get_type(Node) != isl_schedule_node_band) 255 return Node; 256 257 if (isl_schedule_node_n_children(Node) != 1) 258 return Node; 259 260 if (!isl_schedule_node_band_get_permutable(Node)) 261 return Node; 262 263 auto Space = isl_schedule_node_band_get_space(Node); 264 auto Dims = isl_space_dim(Space, isl_dim_set); 265 266 if (Dims <= 1) { 267 isl_space_free(Space); 268 return Node; 269 } 270 271 auto Child = isl_schedule_node_get_child(Node, 0); 272 auto Type = isl_schedule_node_get_type(Child); 273 isl_schedule_node_free(Child); 274 275 if (Type != isl_schedule_node_leaf) { 276 isl_space_free(Space); 277 return Node; 278 } 279 280 auto Sizes = isl_multi_val_zero(Space); 281 auto Ctx = isl_schedule_node_get_ctx(Node); 282 283 for (unsigned i = 0; i < Dims; i++) { 284 auto tileSize = TileSizes.size() > i ? TileSizes[i] : DefaultTileSize; 285 Sizes = isl_multi_val_set_val(Sizes, i, isl_val_int_from_si(Ctx, tileSize)); 286 } 287 288 isl_schedule_node *Res; 289 290 if (DisableTiling) { 291 isl_multi_val_free(Sizes); 292 Res = Node; 293 } else { 294 Res = isl_schedule_node_band_tile(Node, Sizes); 295 } 296 297 if (PollyVectorizerChoice == VECTORIZER_NONE) 298 return Res; 299 300 Child = isl_schedule_node_get_child(Res, 0); 301 auto ChildSchedule = isl_schedule_node_band_get_partial_schedule(Child); 302 303 for (int i = Dims - 1; i >= 0; i--) { 304 if (isl_schedule_node_band_member_get_coincident(Child, i)) { 305 auto TileMap = IslScheduleOptimizer::getPrevectorMap(Ctx, i, Dims); 306 auto TileUMap = isl_union_map_from_map(TileMap); 307 auto ChildSchedule2 = isl_union_map_apply_range( 308 isl_union_map_from_multi_union_pw_aff(ChildSchedule), TileUMap); 309 ChildSchedule = isl_multi_union_pw_aff_from_union_map(ChildSchedule2); 310 break; 311 } 312 } 313 314 isl_schedule_node_free(Res); 315 Res = isl_schedule_node_delete(Child); 316 Res = isl_schedule_node_insert_partial_schedule(Res, ChildSchedule); 317 return Res; 318 } 319 320 __isl_give isl_union_map * 321 IslScheduleOptimizer::getScheduleMap(__isl_keep isl_schedule *Schedule) { 322 isl_schedule_node *Root = isl_schedule_get_root(Schedule); 323 Root = isl_schedule_node_map_descendant( 324 Root, IslScheduleOptimizer::optimizeBand, NULL); 325 auto ScheduleMap = isl_schedule_node_get_subtree_schedule_union_map(Root); 326 ScheduleMap = isl_union_map_detect_equalities(ScheduleMap); 327 isl_schedule_node_free(Root); 328 return ScheduleMap; 329 } 330 331 bool IslScheduleOptimizer::isProfitableSchedule( 332 Scop &S, __isl_keep isl_union_map *NewSchedule) { 333 // To understand if the schedule has been optimized we check if the schedule 334 // has changed at all. 335 // TODO: We can improve this by tracking if any necessarily beneficial 336 // transformations have been performed. This can e.g. be tiling, loop 337 // interchange, or ...) We can track this either at the place where the 338 // transformation has been performed or, in case of automatic ILP based 339 // optimizations, by comparing (yet to be defined) performance metrics 340 // before/after the scheduling optimizer 341 // (e.g., #stride-one accesses) 342 isl_union_map *OldSchedule = S.getSchedule(); 343 bool changed = !isl_union_map_is_equal(OldSchedule, NewSchedule); 344 isl_union_map_free(OldSchedule); 345 return changed; 346 } 347 348 bool IslScheduleOptimizer::runOnScop(Scop &S) { 349 350 // Skip empty SCoPs but still allow code generation as it will delete the 351 // loops present but not needed. 352 if (S.getSize() == 0) { 353 S.markAsOptimized(); 354 return false; 355 } 356 357 const Dependences &D = getAnalysis<DependenceInfo>().getDependences(); 358 359 if (!D.hasValidDependences()) 360 return false; 361 362 isl_schedule_free(LastSchedule); 363 LastSchedule = nullptr; 364 365 // Build input data. 366 int ValidityKinds = 367 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 368 int ProximityKinds; 369 370 if (OptimizeDeps == "all") 371 ProximityKinds = 372 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 373 else if (OptimizeDeps == "raw") 374 ProximityKinds = Dependences::TYPE_RAW; 375 else { 376 errs() << "Do not know how to optimize for '" << OptimizeDeps << "'" 377 << " Falling back to optimizing all dependences.\n"; 378 ProximityKinds = 379 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 380 } 381 382 isl_union_set *Domain = S.getDomains(); 383 384 if (!Domain) 385 return false; 386 387 isl_union_map *Validity = D.getDependences(ValidityKinds); 388 isl_union_map *Proximity = D.getDependences(ProximityKinds); 389 390 // Simplify the dependences by removing the constraints introduced by the 391 // domains. This can speed up the scheduling time significantly, as large 392 // constant coefficients will be removed from the dependences. The 393 // introduction of some additional dependences reduces the possible 394 // transformations, but in most cases, such transformation do not seem to be 395 // interesting anyway. In some cases this option may stop the scheduler to 396 // find any schedule. 397 if (SimplifyDeps == "yes") { 398 Validity = isl_union_map_gist_domain(Validity, isl_union_set_copy(Domain)); 399 Validity = isl_union_map_gist_range(Validity, isl_union_set_copy(Domain)); 400 Proximity = 401 isl_union_map_gist_domain(Proximity, isl_union_set_copy(Domain)); 402 Proximity = isl_union_map_gist_range(Proximity, isl_union_set_copy(Domain)); 403 } else if (SimplifyDeps != "no") { 404 errs() << "warning: Option -polly-opt-simplify-deps should either be 'yes' " 405 "or 'no'. Falling back to default: 'yes'\n"; 406 } 407 408 DEBUG(dbgs() << "\n\nCompute schedule from: "); 409 DEBUG(dbgs() << "Domain := " << stringFromIslObj(Domain) << ";\n"); 410 DEBUG(dbgs() << "Proximity := " << stringFromIslObj(Proximity) << ";\n"); 411 DEBUG(dbgs() << "Validity := " << stringFromIslObj(Validity) << ";\n"); 412 413 int IslFusionStrategy; 414 415 if (FusionStrategy == "max") { 416 IslFusionStrategy = ISL_SCHEDULE_FUSE_MAX; 417 } else if (FusionStrategy == "min") { 418 IslFusionStrategy = ISL_SCHEDULE_FUSE_MIN; 419 } else { 420 errs() << "warning: Unknown fusion strategy. Falling back to maximal " 421 "fusion.\n"; 422 IslFusionStrategy = ISL_SCHEDULE_FUSE_MAX; 423 } 424 425 int IslMaximizeBands; 426 427 if (MaximizeBandDepth == "yes") { 428 IslMaximizeBands = 1; 429 } else if (MaximizeBandDepth == "no") { 430 IslMaximizeBands = 0; 431 } else { 432 errs() << "warning: Option -polly-opt-maximize-bands should either be 'yes'" 433 " or 'no'. Falling back to default: 'yes'\n"; 434 IslMaximizeBands = 1; 435 } 436 437 isl_options_set_schedule_fuse(S.getIslCtx(), IslFusionStrategy); 438 isl_options_set_schedule_maximize_band_depth(S.getIslCtx(), IslMaximizeBands); 439 isl_options_set_schedule_max_constant_term(S.getIslCtx(), MaxConstantTerm); 440 isl_options_set_schedule_max_coefficient(S.getIslCtx(), MaxCoefficient); 441 isl_options_set_tile_scale_tile_loops(S.getIslCtx(), 0); 442 443 isl_options_set_on_error(S.getIslCtx(), ISL_ON_ERROR_CONTINUE); 444 445 isl_schedule_constraints *ScheduleConstraints; 446 ScheduleConstraints = isl_schedule_constraints_on_domain(Domain); 447 ScheduleConstraints = 448 isl_schedule_constraints_set_proximity(ScheduleConstraints, Proximity); 449 ScheduleConstraints = isl_schedule_constraints_set_validity( 450 ScheduleConstraints, isl_union_map_copy(Validity)); 451 ScheduleConstraints = 452 isl_schedule_constraints_set_coincidence(ScheduleConstraints, Validity); 453 isl_schedule *Schedule; 454 Schedule = isl_schedule_constraints_compute_schedule(ScheduleConstraints); 455 isl_options_set_on_error(S.getIslCtx(), ISL_ON_ERROR_ABORT); 456 457 // In cases the scheduler is not able to optimize the code, we just do not 458 // touch the schedule. 459 if (!Schedule) 460 return false; 461 462 DEBUG(dbgs() << "Schedule := " << stringFromIslObj(Schedule) << ";\n"); 463 464 isl_union_map *NewSchedule = getScheduleMap(Schedule); 465 466 // Check if the optimizations performed were profitable, otherwise exit early. 467 if (!isProfitableSchedule(S, NewSchedule)) { 468 isl_schedule_free(Schedule); 469 isl_union_map_free(NewSchedule); 470 return false; 471 } 472 473 S.markAsOptimized(); 474 475 for (ScopStmt *Stmt : S) { 476 isl_map *StmtSchedule; 477 isl_set *Domain = Stmt->getDomain(); 478 isl_union_map *StmtBand; 479 StmtBand = isl_union_map_intersect_domain(isl_union_map_copy(NewSchedule), 480 isl_union_set_from_set(Domain)); 481 if (isl_union_map_is_empty(StmtBand)) { 482 StmtSchedule = isl_map_from_domain(isl_set_empty(Stmt->getDomainSpace())); 483 isl_union_map_free(StmtBand); 484 } else { 485 assert(isl_union_map_n_map(StmtBand) == 1); 486 StmtSchedule = isl_map_from_union_map(StmtBand); 487 } 488 489 Stmt->setScattering(StmtSchedule); 490 } 491 492 isl_schedule_free(Schedule); 493 isl_union_map_free(NewSchedule); 494 return false; 495 } 496 497 void IslScheduleOptimizer::printScop(raw_ostream &OS, Scop &) const { 498 isl_printer *p; 499 char *ScheduleStr; 500 501 OS << "Calculated schedule:\n"; 502 503 if (!LastSchedule) { 504 OS << "n/a\n"; 505 return; 506 } 507 508 p = isl_printer_to_str(isl_schedule_get_ctx(LastSchedule)); 509 p = isl_printer_print_schedule(p, LastSchedule); 510 ScheduleStr = isl_printer_get_str(p); 511 isl_printer_free(p); 512 513 OS << ScheduleStr << "\n"; 514 } 515 516 void IslScheduleOptimizer::getAnalysisUsage(AnalysisUsage &AU) const { 517 ScopPass::getAnalysisUsage(AU); 518 AU.addRequired<DependenceInfo>(); 519 } 520 521 Pass *polly::createIslScheduleOptimizerPass() { 522 return new IslScheduleOptimizer(); 523 } 524 525 INITIALIZE_PASS_BEGIN(IslScheduleOptimizer, "polly-opt-isl", 526 "Polly - Optimize schedule of SCoP", false, false); 527 INITIALIZE_PASS_DEPENDENCY(DependenceInfo); 528 INITIALIZE_PASS_DEPENDENCY(ScopInfo); 529 INITIALIZE_PASS_END(IslScheduleOptimizer, "polly-opt-isl", 530 "Polly - Optimize schedule of SCoP", false, false) 531