1 //===- Schedule.cpp - Calculate an optimized schedule ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This pass the isl to calculate a schedule that is optimized for parallelism 11 // and tileablility. The algorithm used in isl is an optimized version of the 12 // algorithm described in following paper: 13 // 14 // U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan. 15 // A Practical Automatic Polyhedral Parallelizer and Locality Optimizer. 16 // In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language 17 // Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008. 18 //===----------------------------------------------------------------------===// 19 20 #include "polly/ScheduleOptimizer.h" 21 #include "polly/CodeGen/CodeGeneration.h" 22 #include "polly/DependenceInfo.h" 23 #include "polly/LinkAllPasses.h" 24 #include "polly/Options.h" 25 #include "polly/ScopInfo.h" 26 #include "polly/Support/GICHelper.h" 27 #include "llvm/Support/Debug.h" 28 #include "isl/aff.h" 29 #include "isl/band.h" 30 #include "isl/constraint.h" 31 #include "isl/map.h" 32 #include "isl/options.h" 33 #include "isl/printer.h" 34 #include "isl/schedule.h" 35 #include "isl/schedule_node.h" 36 #include "isl/space.h" 37 #include "isl/union_map.h" 38 #include "isl/union_set.h" 39 40 using namespace llvm; 41 using namespace polly; 42 43 #define DEBUG_TYPE "polly-opt-isl" 44 45 namespace polly { 46 bool DisablePollyTiling; 47 } 48 static cl::opt<bool, true> 49 DisableTiling("polly-no-tiling", 50 cl::desc("Disable tiling in the scheduler"), 51 cl::location(polly::DisablePollyTiling), cl::init(false), 52 cl::ZeroOrMore, cl::cat(PollyCategory)); 53 54 static cl::opt<std::string> 55 OptimizeDeps("polly-opt-optimize-only", 56 cl::desc("Only a certain kind of dependences (all/raw)"), 57 cl::Hidden, cl::init("all"), cl::ZeroOrMore, 58 cl::cat(PollyCategory)); 59 60 static cl::opt<std::string> 61 SimplifyDeps("polly-opt-simplify-deps", 62 cl::desc("Dependences should be simplified (yes/no)"), 63 cl::Hidden, cl::init("yes"), cl::ZeroOrMore, 64 cl::cat(PollyCategory)); 65 66 static cl::opt<int> MaxConstantTerm( 67 "polly-opt-max-constant-term", 68 cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden, 69 cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory)); 70 71 static cl::opt<int> MaxCoefficient( 72 "polly-opt-max-coefficient", 73 cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden, 74 cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory)); 75 76 static cl::opt<std::string> FusionStrategy( 77 "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"), 78 cl::Hidden, cl::init("min"), cl::ZeroOrMore, cl::cat(PollyCategory)); 79 80 static cl::opt<std::string> 81 MaximizeBandDepth("polly-opt-maximize-bands", 82 cl::desc("Maximize the band depth (yes/no)"), cl::Hidden, 83 cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory)); 84 85 static cl::opt<int> DefaultTileSize( 86 "polly-default-tile-size", 87 cl::desc("The default tile size (if not enough were provided by" 88 " --polly-tile-sizes)"), 89 cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory)); 90 91 static cl::list<int> TileSizes("polly-tile-sizes", 92 cl::desc("A tile size" 93 " for each loop dimension, filled with" 94 " --polly-default-tile-size"), 95 cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, 96 cl::cat(PollyCategory)); 97 namespace { 98 99 class IslScheduleOptimizer : public ScopPass { 100 public: 101 static char ID; 102 explicit IslScheduleOptimizer() : ScopPass(ID) { LastSchedule = nullptr; } 103 104 ~IslScheduleOptimizer() { isl_schedule_free(LastSchedule); } 105 106 bool runOnScop(Scop &S) override; 107 void printScop(raw_ostream &OS, Scop &S) const override; 108 void getAnalysisUsage(AnalysisUsage &AU) const override; 109 110 private: 111 isl_schedule *LastSchedule; 112 113 /// @brief Decide if the @p NewSchedule is profitable for @p S. 114 /// 115 /// @param S The SCoP we optimize. 116 /// @param NewSchedule The new schedule we computed. 117 /// 118 /// @return True, if we believe @p NewSchedule is an improvement for @p S. 119 bool isProfitableSchedule(Scop &S, __isl_keep isl_union_map *NewSchedule); 120 121 /// @brief Create a map that pre-vectorizes one scheduling dimension. 122 /// 123 /// getPrevectorMap creates a map that maps each input dimension to the same 124 /// output dimension, except for the dimension DimToVectorize. 125 /// DimToVectorize is strip mined by 'VectorWidth' and the newly created 126 /// point loop of DimToVectorize is moved to the innermost level. 127 /// 128 /// Example (DimToVectorize=0, ScheduleDimensions=2, VectorWidth=4): 129 /// 130 /// | Before transformation 131 /// | 132 /// | A[i,j] -> [i,j] 133 /// | 134 /// | for (i = 0; i < 128; i++) 135 /// | for (j = 0; j < 128; j++) 136 /// | A(i,j); 137 /// 138 /// Prevector map: 139 /// [i,j] -> [it,j,ip] : it % 4 = 0 and it <= ip <= it + 3 and i = ip 140 /// 141 /// | After transformation: 142 /// | 143 /// | A[i,j] -> [it,j,ip] : it % 4 = 0 and it <= ip <= it + 3 and i = ip 144 /// | 145 /// | for (it = 0; it < 128; it+=4) 146 /// | for (j = 0; j < 128; j++) 147 /// | for (ip = max(0,it); ip < min(128, it + 3); ip++) 148 /// | A(ip,j); 149 /// 150 /// The goal of this transformation is to create a trivially vectorizable 151 /// loop. This means a parallel loop at the innermost level that has a 152 /// constant number of iterations corresponding to the target vector width. 153 /// 154 /// This transformation creates a loop at the innermost level. The loop has 155 /// a constant number of iterations, if the number of loop iterations at 156 /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is 157 /// currently constant and not yet target specific. This function does not 158 /// reason about parallelism. 159 static __isl_give isl_map *getPrevectorMap(isl_ctx *ctx, int DimToVectorize, 160 int ScheduleDimensions, 161 int VectorWidth = 4); 162 163 /// @brief Apply additional optimizations on the bands in the schedule tree. 164 /// 165 /// We are looking for an innermost band node and apply the following 166 /// transformations: 167 /// 168 /// - Tile the band 169 /// - if the band is tileable 170 /// - if the band has more than one loop dimension 171 /// 172 /// - Prevectorize the point loop of the tile 173 /// - if vectorization is enabled 174 /// 175 /// @param Node The schedule node to (possibly) optimize. 176 /// @param User A pointer to forward some use information (currently unused). 177 static isl_schedule_node *optimizeBand(isl_schedule_node *Node, void *User); 178 179 /// @brief Apply post-scheduling transformations. 180 /// 181 /// This function applies a set of additional local transformations on the 182 /// schedule tree as it computed by the isl scheduler. Local transformations 183 /// applied include: 184 /// 185 /// - Tiling 186 /// - Prevectorization 187 /// 188 /// @param Schedule The schedule object post-transformations will be applied 189 /// on. 190 /// @returns The transformed schedule. 191 static __isl_give isl_schedule * 192 addPostTransforms(__isl_take isl_schedule *Schedule); 193 194 using llvm::Pass::doFinalization; 195 196 virtual bool doFinalization() override { 197 isl_schedule_free(LastSchedule); 198 LastSchedule = nullptr; 199 return true; 200 } 201 }; 202 } 203 204 char IslScheduleOptimizer::ID = 0; 205 206 __isl_give isl_map * 207 IslScheduleOptimizer::getPrevectorMap(isl_ctx *ctx, int DimToVectorize, 208 int ScheduleDimensions, int VectorWidth) { 209 isl_space *Space; 210 isl_local_space *LocalSpace, *LocalSpaceRange; 211 isl_set *Modulo; 212 isl_map *TilingMap; 213 isl_constraint *c; 214 isl_aff *Aff; 215 int PointDimension; /* ip */ 216 int TileDimension; /* it */ 217 isl_val *VectorWidthMP; 218 219 assert(0 <= DimToVectorize && DimToVectorize < ScheduleDimensions); 220 221 Space = isl_space_alloc(ctx, 0, ScheduleDimensions, ScheduleDimensions + 1); 222 TilingMap = isl_map_universe(isl_space_copy(Space)); 223 LocalSpace = isl_local_space_from_space(Space); 224 PointDimension = ScheduleDimensions; 225 TileDimension = DimToVectorize; 226 227 // Create an identity map for everything except DimToVectorize and map 228 // DimToVectorize to the point loop at the innermost dimension. 229 for (int i = 0; i < ScheduleDimensions; i++) 230 if (i == DimToVectorize) 231 TilingMap = 232 isl_map_equate(TilingMap, isl_dim_in, i, isl_dim_out, PointDimension); 233 else 234 TilingMap = isl_map_equate(TilingMap, isl_dim_in, i, isl_dim_out, i); 235 236 // it % 'VectorWidth' = 0 237 LocalSpaceRange = isl_local_space_range(isl_local_space_copy(LocalSpace)); 238 Aff = isl_aff_zero_on_domain(LocalSpaceRange); 239 Aff = isl_aff_set_constant_si(Aff, VectorWidth); 240 Aff = isl_aff_set_coefficient_si(Aff, isl_dim_in, TileDimension, 1); 241 VectorWidthMP = isl_val_int_from_si(ctx, VectorWidth); 242 Aff = isl_aff_mod_val(Aff, VectorWidthMP); 243 Modulo = isl_pw_aff_zero_set(isl_pw_aff_from_aff(Aff)); 244 TilingMap = isl_map_intersect_range(TilingMap, Modulo); 245 246 // it <= ip 247 TilingMap = isl_map_order_le(TilingMap, isl_dim_out, TileDimension, 248 isl_dim_out, PointDimension); 249 250 // ip <= it + ('VectorWidth' - 1) 251 c = isl_inequality_alloc(LocalSpace); 252 isl_constraint_set_coefficient_si(c, isl_dim_out, TileDimension, 1); 253 isl_constraint_set_coefficient_si(c, isl_dim_out, PointDimension, -1); 254 isl_constraint_set_constant_si(c, VectorWidth - 1); 255 TilingMap = isl_map_add_constraint(TilingMap, c); 256 257 return TilingMap; 258 } 259 260 isl_schedule_node *IslScheduleOptimizer::optimizeBand(isl_schedule_node *Node, 261 void *User) { 262 if (isl_schedule_node_get_type(Node) != isl_schedule_node_band) 263 return Node; 264 265 if (isl_schedule_node_n_children(Node) != 1) 266 return Node; 267 268 if (!isl_schedule_node_band_get_permutable(Node)) 269 return Node; 270 271 auto Space = isl_schedule_node_band_get_space(Node); 272 auto Dims = isl_space_dim(Space, isl_dim_set); 273 274 if (Dims <= 1) { 275 isl_space_free(Space); 276 return Node; 277 } 278 279 auto Child = isl_schedule_node_get_child(Node, 0); 280 auto Type = isl_schedule_node_get_type(Child); 281 isl_schedule_node_free(Child); 282 283 if (Type != isl_schedule_node_leaf) { 284 isl_space_free(Space); 285 return Node; 286 } 287 288 auto Sizes = isl_multi_val_zero(Space); 289 auto Ctx = isl_schedule_node_get_ctx(Node); 290 291 for (unsigned i = 0; i < Dims; i++) { 292 auto tileSize = TileSizes.size() > i ? TileSizes[i] : DefaultTileSize; 293 Sizes = isl_multi_val_set_val(Sizes, i, isl_val_int_from_si(Ctx, tileSize)); 294 } 295 296 isl_schedule_node *Res; 297 298 if (DisableTiling) { 299 isl_multi_val_free(Sizes); 300 Res = Node; 301 } else { 302 Res = isl_schedule_node_band_tile(Node, Sizes); 303 } 304 305 if (PollyVectorizerChoice == VECTORIZER_NONE) 306 return Res; 307 308 Child = isl_schedule_node_get_child(Res, 0); 309 auto ChildSchedule = isl_schedule_node_band_get_partial_schedule(Child); 310 311 for (int i = Dims - 1; i >= 0; i--) { 312 if (isl_schedule_node_band_member_get_coincident(Child, i)) { 313 auto TileMap = IslScheduleOptimizer::getPrevectorMap(Ctx, i, Dims); 314 auto TileUMap = isl_union_map_from_map(TileMap); 315 auto ChildSchedule2 = isl_union_map_apply_range( 316 isl_union_map_from_multi_union_pw_aff(ChildSchedule), TileUMap); 317 ChildSchedule = isl_multi_union_pw_aff_from_union_map(ChildSchedule2); 318 break; 319 } 320 } 321 322 isl_schedule_node_free(Res); 323 Res = isl_schedule_node_delete(Child); 324 Res = isl_schedule_node_insert_partial_schedule(Res, ChildSchedule); 325 return Res; 326 } 327 328 __isl_give isl_schedule * 329 IslScheduleOptimizer::addPostTransforms(__isl_take isl_schedule *Schedule) { 330 isl_schedule_node *Root = isl_schedule_get_root(Schedule); 331 isl_schedule_free(Schedule); 332 Root = isl_schedule_node_map_descendant_bottom_up( 333 Root, IslScheduleOptimizer::optimizeBand, NULL); 334 auto S = isl_schedule_node_get_schedule(Root); 335 isl_schedule_node_free(Root); 336 return S; 337 } 338 339 bool IslScheduleOptimizer::isProfitableSchedule( 340 Scop &S, __isl_keep isl_union_map *NewSchedule) { 341 // To understand if the schedule has been optimized we check if the schedule 342 // has changed at all. 343 // TODO: We can improve this by tracking if any necessarily beneficial 344 // transformations have been performed. This can e.g. be tiling, loop 345 // interchange, or ...) We can track this either at the place where the 346 // transformation has been performed or, in case of automatic ILP based 347 // optimizations, by comparing (yet to be defined) performance metrics 348 // before/after the scheduling optimizer 349 // (e.g., #stride-one accesses) 350 isl_union_map *OldSchedule = S.getSchedule(); 351 bool changed = !isl_union_map_is_equal(OldSchedule, NewSchedule); 352 isl_union_map_free(OldSchedule); 353 return changed; 354 } 355 356 bool IslScheduleOptimizer::runOnScop(Scop &S) { 357 358 // Skip empty SCoPs but still allow code generation as it will delete the 359 // loops present but not needed. 360 if (S.getSize() == 0) { 361 S.markAsOptimized(); 362 return false; 363 } 364 365 const Dependences &D = getAnalysis<DependenceInfo>().getDependences(); 366 367 if (!D.hasValidDependences()) 368 return false; 369 370 isl_schedule_free(LastSchedule); 371 LastSchedule = nullptr; 372 373 // Build input data. 374 int ValidityKinds = 375 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 376 int ProximityKinds; 377 378 if (OptimizeDeps == "all") 379 ProximityKinds = 380 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 381 else if (OptimizeDeps == "raw") 382 ProximityKinds = Dependences::TYPE_RAW; 383 else { 384 errs() << "Do not know how to optimize for '" << OptimizeDeps << "'" 385 << " Falling back to optimizing all dependences.\n"; 386 ProximityKinds = 387 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 388 } 389 390 isl_union_set *Domain = S.getDomains(); 391 392 if (!Domain) 393 return false; 394 395 isl_union_map *Validity = D.getDependences(ValidityKinds); 396 isl_union_map *Proximity = D.getDependences(ProximityKinds); 397 398 // Simplify the dependences by removing the constraints introduced by the 399 // domains. This can speed up the scheduling time significantly, as large 400 // constant coefficients will be removed from the dependences. The 401 // introduction of some additional dependences reduces the possible 402 // transformations, but in most cases, such transformation do not seem to be 403 // interesting anyway. In some cases this option may stop the scheduler to 404 // find any schedule. 405 if (SimplifyDeps == "yes") { 406 Validity = isl_union_map_gist_domain(Validity, isl_union_set_copy(Domain)); 407 Validity = isl_union_map_gist_range(Validity, isl_union_set_copy(Domain)); 408 Proximity = 409 isl_union_map_gist_domain(Proximity, isl_union_set_copy(Domain)); 410 Proximity = isl_union_map_gist_range(Proximity, isl_union_set_copy(Domain)); 411 } else if (SimplifyDeps != "no") { 412 errs() << "warning: Option -polly-opt-simplify-deps should either be 'yes' " 413 "or 'no'. Falling back to default: 'yes'\n"; 414 } 415 416 DEBUG(dbgs() << "\n\nCompute schedule from: "); 417 DEBUG(dbgs() << "Domain := " << stringFromIslObj(Domain) << ";\n"); 418 DEBUG(dbgs() << "Proximity := " << stringFromIslObj(Proximity) << ";\n"); 419 DEBUG(dbgs() << "Validity := " << stringFromIslObj(Validity) << ";\n"); 420 421 unsigned IslSerializeSCCs; 422 423 if (FusionStrategy == "max") { 424 IslSerializeSCCs = 0; 425 } else if (FusionStrategy == "min") { 426 IslSerializeSCCs = 1; 427 } else { 428 errs() << "warning: Unknown fusion strategy. Falling back to maximal " 429 "fusion.\n"; 430 IslSerializeSCCs = 0; 431 } 432 433 int IslMaximizeBands; 434 435 if (MaximizeBandDepth == "yes") { 436 IslMaximizeBands = 1; 437 } else if (MaximizeBandDepth == "no") { 438 IslMaximizeBands = 0; 439 } else { 440 errs() << "warning: Option -polly-opt-maximize-bands should either be 'yes'" 441 " or 'no'. Falling back to default: 'yes'\n"; 442 IslMaximizeBands = 1; 443 } 444 445 isl_options_set_schedule_serialize_sccs(S.getIslCtx(), IslSerializeSCCs); 446 isl_options_set_schedule_maximize_band_depth(S.getIslCtx(), IslMaximizeBands); 447 isl_options_set_schedule_max_constant_term(S.getIslCtx(), MaxConstantTerm); 448 isl_options_set_schedule_max_coefficient(S.getIslCtx(), MaxCoefficient); 449 isl_options_set_tile_scale_tile_loops(S.getIslCtx(), 0); 450 451 isl_options_set_on_error(S.getIslCtx(), ISL_ON_ERROR_CONTINUE); 452 453 isl_schedule_constraints *ScheduleConstraints; 454 ScheduleConstraints = isl_schedule_constraints_on_domain(Domain); 455 ScheduleConstraints = 456 isl_schedule_constraints_set_proximity(ScheduleConstraints, Proximity); 457 ScheduleConstraints = isl_schedule_constraints_set_validity( 458 ScheduleConstraints, isl_union_map_copy(Validity)); 459 ScheduleConstraints = 460 isl_schedule_constraints_set_coincidence(ScheduleConstraints, Validity); 461 isl_schedule *Schedule; 462 Schedule = isl_schedule_constraints_compute_schedule(ScheduleConstraints); 463 isl_options_set_on_error(S.getIslCtx(), ISL_ON_ERROR_ABORT); 464 465 // In cases the scheduler is not able to optimize the code, we just do not 466 // touch the schedule. 467 if (!Schedule) 468 return false; 469 470 DEBUG({ 471 auto *P = isl_printer_to_str(S.getIslCtx()); 472 P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); 473 P = isl_printer_print_schedule(P, Schedule); 474 dbgs() << "NewScheduleTree: \n" << isl_printer_get_str(P) << "\n"; 475 isl_printer_free(P); 476 }); 477 478 isl_schedule *NewSchedule = addPostTransforms(Schedule); 479 isl_union_map *NewScheduleMap = isl_schedule_get_map(NewSchedule); 480 481 if (!isProfitableSchedule(S, NewScheduleMap)) { 482 isl_union_map_free(NewScheduleMap); 483 isl_schedule_free(NewSchedule); 484 return false; 485 } 486 487 S.setScheduleTree(NewSchedule); 488 S.markAsOptimized(); 489 490 isl_union_map_free(NewScheduleMap); 491 return false; 492 } 493 494 void IslScheduleOptimizer::printScop(raw_ostream &OS, Scop &) const { 495 isl_printer *p; 496 char *ScheduleStr; 497 498 OS << "Calculated schedule:\n"; 499 500 if (!LastSchedule) { 501 OS << "n/a\n"; 502 return; 503 } 504 505 p = isl_printer_to_str(isl_schedule_get_ctx(LastSchedule)); 506 p = isl_printer_print_schedule(p, LastSchedule); 507 ScheduleStr = isl_printer_get_str(p); 508 isl_printer_free(p); 509 510 OS << ScheduleStr << "\n"; 511 } 512 513 void IslScheduleOptimizer::getAnalysisUsage(AnalysisUsage &AU) const { 514 ScopPass::getAnalysisUsage(AU); 515 AU.addRequired<DependenceInfo>(); 516 } 517 518 Pass *polly::createIslScheduleOptimizerPass() { 519 return new IslScheduleOptimizer(); 520 } 521 522 INITIALIZE_PASS_BEGIN(IslScheduleOptimizer, "polly-opt-isl", 523 "Polly - Optimize schedule of SCoP", false, false); 524 INITIALIZE_PASS_DEPENDENCY(DependenceInfo); 525 INITIALIZE_PASS_DEPENDENCY(ScopInfo); 526 INITIALIZE_PASS_END(IslScheduleOptimizer, "polly-opt-isl", 527 "Polly - Optimize schedule of SCoP", false, false) 528