1 //===- Schedule.cpp - Calculate an optimized schedule ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This pass generates an entirey new schedule tree from the data dependences 11 // and iteration domains. The new schedule tree is computed in two steps: 12 // 13 // 1) The isl scheduling optimizer is run 14 // 15 // The isl scheduling optimizer creates a new schedule tree that maximizes 16 // parallelism and tileability and minimizes data-dependence distances. The 17 // algorithm used is a modified version of the ``Pluto'' algorithm: 18 // 19 // U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan. 20 // A Practical Automatic Polyhedral Parallelizer and Locality Optimizer. 21 // In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language 22 // Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008. 23 // 24 // 2) A set of post-scheduling transformations is applied on the schedule tree. 25 // 26 // These optimizations include: 27 // 28 // - Tiling of the innermost tilable bands 29 // - Prevectorization - The coice of a possible outer loop that is strip-mined 30 // to the innermost level to enable inner-loop 31 // vectorization. 32 // - Some optimizations for spatial locality are also planned. 33 // 34 // For a detailed description of the schedule tree itself please see section 6 35 // of: 36 // 37 // Polyhedral AST generation is more than scanning polyhedra 38 // Tobias Grosser, Sven Verdoolaege, Albert Cohen 39 // ACM Transations on Programming Languages and Systems (TOPLAS), 40 // 37(4), July 2015 41 // http://www.grosser.es/#pub-polyhedral-AST-generation 42 // 43 // This publication also contains a detailed discussion of the different options 44 // for polyhedral loop unrolling, full/partial tile separation and other uses 45 // of the schedule tree. 46 // 47 //===----------------------------------------------------------------------===// 48 49 #include "polly/ScheduleOptimizer.h" 50 #include "polly/CodeGen/CodeGeneration.h" 51 #include "polly/DependenceInfo.h" 52 #include "polly/LinkAllPasses.h" 53 #include "polly/Options.h" 54 #include "polly/ScopInfo.h" 55 #include "polly/Support/GICHelper.h" 56 #include "llvm/Support/Debug.h" 57 #include "isl/aff.h" 58 #include "isl/band.h" 59 #include "isl/constraint.h" 60 #include "isl/map.h" 61 #include "isl/options.h" 62 #include "isl/printer.h" 63 #include "isl/schedule.h" 64 #include "isl/schedule_node.h" 65 #include "isl/space.h" 66 #include "isl/union_map.h" 67 #include "isl/union_set.h" 68 #include <ciso646> 69 70 using namespace llvm; 71 using namespace polly; 72 73 #define DEBUG_TYPE "polly-opt-isl" 74 75 static cl::opt<std::string> 76 OptimizeDeps("polly-opt-optimize-only", 77 cl::desc("Only a certain kind of dependences (all/raw)"), 78 cl::Hidden, cl::init("all"), cl::ZeroOrMore, 79 cl::cat(PollyCategory)); 80 81 static cl::opt<std::string> 82 SimplifyDeps("polly-opt-simplify-deps", 83 cl::desc("Dependences should be simplified (yes/no)"), 84 cl::Hidden, cl::init("yes"), cl::ZeroOrMore, 85 cl::cat(PollyCategory)); 86 87 static cl::opt<int> MaxConstantTerm( 88 "polly-opt-max-constant-term", 89 cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden, 90 cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory)); 91 92 static cl::opt<int> MaxCoefficient( 93 "polly-opt-max-coefficient", 94 cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden, 95 cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory)); 96 97 static cl::opt<std::string> FusionStrategy( 98 "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"), 99 cl::Hidden, cl::init("min"), cl::ZeroOrMore, cl::cat(PollyCategory)); 100 101 static cl::opt<std::string> 102 MaximizeBandDepth("polly-opt-maximize-bands", 103 cl::desc("Maximize the band depth (yes/no)"), cl::Hidden, 104 cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory)); 105 106 static cl::opt<std::string> OuterCoincidence( 107 "polly-opt-outer-coincidence", 108 cl::desc("Try to construct schedules where the outer member of each band " 109 "satisfies the coincidence constraints (yes/no)"), 110 cl::Hidden, cl::init("no"), cl::ZeroOrMore, cl::cat(PollyCategory)); 111 112 static cl::opt<int> PrevectorWidth( 113 "polly-prevect-width", 114 cl::desc( 115 "The number of loop iterations to strip-mine for pre-vectorization"), 116 cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory)); 117 118 static cl::opt<bool> FirstLevelTiling("polly-tiling", 119 cl::desc("Enable loop tiling"), 120 cl::init(true), cl::ZeroOrMore, 121 cl::cat(PollyCategory)); 122 123 static cl::opt<int> FirstLevelDefaultTileSize( 124 "polly-default-tile-size", 125 cl::desc("The default tile size (if not enough were provided by" 126 " --polly-tile-sizes)"), 127 cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory)); 128 129 static cl::list<int> FirstLevelTileSizes( 130 "polly-tile-sizes", cl::desc("A tile size for each loop dimension, filled " 131 "with --polly-default-tile-size"), 132 cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory)); 133 134 static cl::opt<bool> 135 SecondLevelTiling("polly-2nd-level-tiling", 136 cl::desc("Enable a 2nd level loop of loop tiling"), 137 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); 138 139 static cl::opt<int> SecondLevelDefaultTileSize( 140 "polly-2nd-level-default-tile-size", 141 cl::desc("The default 2nd-level tile size (if not enough were provided by" 142 " --polly-2nd-level-tile-sizes)"), 143 cl::Hidden, cl::init(16), cl::ZeroOrMore, cl::cat(PollyCategory)); 144 145 static cl::list<int> 146 SecondLevelTileSizes("polly-2nd-level-tile-sizes", 147 cl::desc("A tile size for each loop dimension, filled " 148 "with --polly-default-tile-size"), 149 cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, 150 cl::cat(PollyCategory)); 151 152 static cl::opt<bool> RegisterTiling("polly-register-tiling", 153 cl::desc("Enable register tiling"), 154 cl::init(false), cl::ZeroOrMore, 155 cl::cat(PollyCategory)); 156 157 static cl::opt<int> RegisterDefaultTileSize( 158 "polly-register-tiling-default-tile-size", 159 cl::desc("The default register tile size (if not enough were provided by" 160 " --polly-register-tile-sizes)"), 161 cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory)); 162 163 static cl::list<int> 164 RegisterTileSizes("polly-register-tile-sizes", 165 cl::desc("A tile size for each loop dimension, filled " 166 "with --polly-register-tile-size"), 167 cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, 168 cl::cat(PollyCategory)); 169 170 static cl::opt<bool> 171 PMBasedOpts("polly-pattern-matching-based-opts", 172 cl::desc("Perform optimizations based on pattern matching"), 173 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); 174 175 /// @brief Create an isl_union_set, which describes the isolate option based 176 /// on IsoalteDomain. 177 /// 178 /// @param IsolateDomain An isl_set whose last dimension is the only one that 179 /// should belong to the current band node. 180 static __isl_give isl_union_set * 181 getIsolateOptions(__isl_take isl_set *IsolateDomain) { 182 auto Dims = isl_set_dim(IsolateDomain, isl_dim_set); 183 auto *IsolateRelation = isl_map_from_domain(IsolateDomain); 184 IsolateRelation = isl_map_move_dims(IsolateRelation, isl_dim_out, 0, 185 isl_dim_in, Dims - 1, 1); 186 auto *IsolateOption = isl_map_wrap(IsolateRelation); 187 auto *Id = isl_id_alloc(isl_set_get_ctx(IsolateOption), "isolate", NULL); 188 return isl_union_set_from_set(isl_set_set_tuple_id(IsolateOption, Id)); 189 } 190 191 /// @brief Create an isl_union_set, which describes the atomic option for the 192 /// dimension of the current node. 193 /// 194 /// It may help to reduce the size of generated code. 195 /// 196 /// @param Ctx An isl_ctx, which is used to create the isl_union_set. 197 static __isl_give isl_union_set *getAtomicOptions(__isl_take isl_ctx *Ctx) { 198 auto *Space = isl_space_set_alloc(Ctx, 0, 1); 199 auto *AtomicOption = isl_set_universe(Space); 200 auto *Id = isl_id_alloc(Ctx, "atomic", NULL); 201 return isl_union_set_from_set(isl_set_set_tuple_id(AtomicOption, Id)); 202 } 203 204 /// @brief Make the last dimension of Set to take values 205 /// from 0 to VectorWidth - 1. 206 /// 207 /// @param Set A set, which should be modified. 208 /// @param VectorWidth A parameter, which determines the constraint. 209 static __isl_give isl_set *addExtentConstraints(__isl_take isl_set *Set, 210 int VectorWidth) { 211 auto Dims = isl_set_dim(Set, isl_dim_set); 212 auto Space = isl_set_get_space(Set); 213 auto *LocalSpace = isl_local_space_from_space(Space); 214 auto *ExtConstr = 215 isl_constraint_alloc_inequality(isl_local_space_copy(LocalSpace)); 216 ExtConstr = isl_constraint_set_constant_si(ExtConstr, 0); 217 ExtConstr = 218 isl_constraint_set_coefficient_si(ExtConstr, isl_dim_set, Dims - 1, 1); 219 Set = isl_set_add_constraint(Set, ExtConstr); 220 ExtConstr = isl_constraint_alloc_inequality(LocalSpace); 221 ExtConstr = isl_constraint_set_constant_si(ExtConstr, VectorWidth - 1); 222 ExtConstr = 223 isl_constraint_set_coefficient_si(ExtConstr, isl_dim_set, Dims - 1, -1); 224 return isl_set_add_constraint(Set, ExtConstr); 225 } 226 227 /// @brief Build the desired set of partial tile prefixes. 228 /// 229 /// We build a set of partial tile prefixes, which are prefixes of the vector 230 /// loop that have exactly VectorWidth iterations. 231 /// 232 /// 1. Get all prefixes of the vector loop. 233 /// 2. Extend it to a set, which has exactly VectorWidth iterations for 234 /// any prefix from the set that was built on the previous step. 235 /// 3. Subtract loop domain from it, project out the vector loop dimension and 236 /// get a set of prefixes, which don't have exactly VectorWidth iterations. 237 /// 4. Subtract it from all prefixes of the vector loop and get the desired 238 /// set. 239 /// 240 /// @param ScheduleRange A range of a map, which describes a prefix schedule 241 /// relation. 242 static __isl_give isl_set * 243 getPartialTilePrefixes(__isl_take isl_set *ScheduleRange, int VectorWidth) { 244 auto Dims = isl_set_dim(ScheduleRange, isl_dim_set); 245 auto *LoopPrefixes = isl_set_project_out(isl_set_copy(ScheduleRange), 246 isl_dim_set, Dims - 1, 1); 247 auto *ExtentPrefixes = 248 isl_set_add_dims(isl_set_copy(LoopPrefixes), isl_dim_set, 1); 249 ExtentPrefixes = addExtentConstraints(ExtentPrefixes, VectorWidth); 250 auto *BadPrefixes = isl_set_subtract(ExtentPrefixes, ScheduleRange); 251 BadPrefixes = isl_set_project_out(BadPrefixes, isl_dim_set, Dims - 1, 1); 252 return isl_set_subtract(LoopPrefixes, BadPrefixes); 253 } 254 255 __isl_give isl_schedule_node *ScheduleTreeOptimizer::isolateFullPartialTiles( 256 __isl_take isl_schedule_node *Node, int VectorWidth) { 257 assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band); 258 Node = isl_schedule_node_child(Node, 0); 259 Node = isl_schedule_node_child(Node, 0); 260 auto *SchedRelUMap = isl_schedule_node_get_prefix_schedule_relation(Node); 261 auto *ScheduleRelation = isl_map_from_union_map(SchedRelUMap); 262 auto *ScheduleRange = isl_map_range(ScheduleRelation); 263 auto *IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth); 264 auto *AtomicOption = getAtomicOptions(isl_set_get_ctx(IsolateDomain)); 265 auto *IsolateOption = getIsolateOptions(IsolateDomain); 266 Node = isl_schedule_node_parent(Node); 267 Node = isl_schedule_node_parent(Node); 268 auto *Options = isl_union_set_union(IsolateOption, AtomicOption); 269 Node = isl_schedule_node_band_set_ast_build_options(Node, Options); 270 return Node; 271 } 272 273 __isl_give isl_schedule_node * 274 ScheduleTreeOptimizer::prevectSchedBand(__isl_take isl_schedule_node *Node, 275 unsigned DimToVectorize, 276 int VectorWidth) { 277 assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band); 278 279 auto Space = isl_schedule_node_band_get_space(Node); 280 auto ScheduleDimensions = isl_space_dim(Space, isl_dim_set); 281 isl_space_free(Space); 282 assert(DimToVectorize < ScheduleDimensions); 283 284 if (DimToVectorize > 0) { 285 Node = isl_schedule_node_band_split(Node, DimToVectorize); 286 Node = isl_schedule_node_child(Node, 0); 287 } 288 if (DimToVectorize < ScheduleDimensions - 1) 289 Node = isl_schedule_node_band_split(Node, 1); 290 Space = isl_schedule_node_band_get_space(Node); 291 auto Sizes = isl_multi_val_zero(Space); 292 auto Ctx = isl_schedule_node_get_ctx(Node); 293 Sizes = 294 isl_multi_val_set_val(Sizes, 0, isl_val_int_from_si(Ctx, VectorWidth)); 295 Node = isl_schedule_node_band_tile(Node, Sizes); 296 Node = isolateFullPartialTiles(Node, VectorWidth); 297 Node = isl_schedule_node_child(Node, 0); 298 // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise, 299 // we will have troubles to match it in the backend. 300 Node = isl_schedule_node_band_set_ast_build_options( 301 Node, isl_union_set_read_from_str(Ctx, "{ unroll[x]: 1 = 0 }")); 302 Node = isl_schedule_node_band_sink(Node); 303 Node = isl_schedule_node_child(Node, 0); 304 if (isl_schedule_node_get_type(Node) == isl_schedule_node_leaf) 305 Node = isl_schedule_node_parent(Node); 306 isl_id *LoopMarker = isl_id_alloc(Ctx, "SIMD", nullptr); 307 Node = isl_schedule_node_insert_mark(Node, LoopMarker); 308 return Node; 309 } 310 311 __isl_give isl_schedule_node * 312 ScheduleTreeOptimizer::tileNode(__isl_take isl_schedule_node *Node, 313 const char *Identifier, ArrayRef<int> TileSizes, 314 int DefaultTileSize) { 315 auto Ctx = isl_schedule_node_get_ctx(Node); 316 auto Space = isl_schedule_node_band_get_space(Node); 317 auto Dims = isl_space_dim(Space, isl_dim_set); 318 auto Sizes = isl_multi_val_zero(Space); 319 std::string IdentifierString(Identifier); 320 for (unsigned i = 0; i < Dims; i++) { 321 auto tileSize = i < TileSizes.size() ? TileSizes[i] : DefaultTileSize; 322 Sizes = isl_multi_val_set_val(Sizes, i, isl_val_int_from_si(Ctx, tileSize)); 323 } 324 auto TileLoopMarkerStr = IdentifierString + " - Tiles"; 325 isl_id *TileLoopMarker = 326 isl_id_alloc(Ctx, TileLoopMarkerStr.c_str(), nullptr); 327 Node = isl_schedule_node_insert_mark(Node, TileLoopMarker); 328 Node = isl_schedule_node_child(Node, 0); 329 Node = isl_schedule_node_band_tile(Node, Sizes); 330 Node = isl_schedule_node_child(Node, 0); 331 auto PointLoopMarkerStr = IdentifierString + " - Points"; 332 isl_id *PointLoopMarker = 333 isl_id_alloc(Ctx, PointLoopMarkerStr.c_str(), nullptr); 334 Node = isl_schedule_node_insert_mark(Node, PointLoopMarker); 335 Node = isl_schedule_node_child(Node, 0); 336 return Node; 337 } 338 339 bool ScheduleTreeOptimizer::isTileableBandNode( 340 __isl_keep isl_schedule_node *Node) { 341 if (isl_schedule_node_get_type(Node) != isl_schedule_node_band) 342 return false; 343 344 if (isl_schedule_node_n_children(Node) != 1) 345 return false; 346 347 if (!isl_schedule_node_band_get_permutable(Node)) 348 return false; 349 350 auto Space = isl_schedule_node_band_get_space(Node); 351 auto Dims = isl_space_dim(Space, isl_dim_set); 352 isl_space_free(Space); 353 354 if (Dims <= 1) 355 return false; 356 357 auto Child = isl_schedule_node_get_child(Node, 0); 358 auto Type = isl_schedule_node_get_type(Child); 359 isl_schedule_node_free(Child); 360 361 if (Type != isl_schedule_node_leaf) 362 return false; 363 364 return true; 365 } 366 367 __isl_give isl_schedule_node * 368 ScheduleTreeOptimizer::standardBandOpts(__isl_take isl_schedule_node *Node, 369 void *User) { 370 if (FirstLevelTiling) 371 Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes, 372 FirstLevelDefaultTileSize); 373 374 if (SecondLevelTiling) 375 Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes, 376 SecondLevelDefaultTileSize); 377 378 if (RegisterTiling) { 379 auto *Ctx = isl_schedule_node_get_ctx(Node); 380 Node = tileNode(Node, "Register tiling", RegisterTileSizes, 381 RegisterDefaultTileSize); 382 Node = isl_schedule_node_band_set_ast_build_options( 383 Node, isl_union_set_read_from_str(Ctx, "{unroll[x]}")); 384 } 385 386 if (PollyVectorizerChoice == VECTORIZER_NONE) 387 return Node; 388 389 auto Space = isl_schedule_node_band_get_space(Node); 390 auto Dims = isl_space_dim(Space, isl_dim_set); 391 isl_space_free(Space); 392 393 for (int i = Dims - 1; i >= 0; i--) 394 if (isl_schedule_node_band_member_get_coincident(Node, i)) { 395 Node = prevectSchedBand(Node, i, PrevectorWidth); 396 break; 397 } 398 399 return Node; 400 } 401 402 /// @brief Check whether output dimensions of the map rely on the specified 403 /// input dimension. 404 /// 405 /// @param IslMap The isl map to be considered. 406 /// @param DimNum The number of an input dimension to be checked. 407 static bool isInputDimUsed(__isl_take isl_map *IslMap, unsigned DimNum) { 408 auto *CheckedAccessRelation = 409 isl_map_project_out(isl_map_copy(IslMap), isl_dim_in, DimNum, 1); 410 CheckedAccessRelation = 411 isl_map_insert_dims(CheckedAccessRelation, isl_dim_in, DimNum, 1); 412 auto *InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in); 413 CheckedAccessRelation = 414 isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_in, InputDimsId); 415 InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_out); 416 CheckedAccessRelation = 417 isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_out, InputDimsId); 418 auto res = !isl_map_is_equal(CheckedAccessRelation, IslMap); 419 isl_map_free(CheckedAccessRelation); 420 isl_map_free(IslMap); 421 return res; 422 } 423 424 /// @brief Check if the SCoP statement could probably be optimized with 425 /// analytical modeling. 426 /// 427 /// containsMatrMult tries to determine whether the following conditions 428 /// are true: 429 /// 1. all memory accesses of the statement will have stride 0 or 1, 430 /// if we interchange loops (switch the variable used in the inner 431 /// loop to the outer loop). 432 /// 2. all memory accesses of the statement except from the last one, are 433 /// read memory access and the last one is write memory access. 434 /// 3. all subscripts of the last memory access of the statement don't contain 435 /// the variable used in the inner loop. 436 /// 437 /// @param PartialSchedule The PartialSchedule that contains a SCoP statement 438 /// to check. 439 static bool containsMatrMult(__isl_keep isl_map *PartialSchedule) { 440 auto InputDimsId = isl_map_get_tuple_id(PartialSchedule, isl_dim_in); 441 auto *ScpStmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId)); 442 isl_id_free(InputDimsId); 443 if (ScpStmt->size() <= 1) 444 return false; 445 auto MemA = ScpStmt->begin(); 446 for (unsigned i = 0; i < ScpStmt->size() - 2 && MemA != ScpStmt->end(); 447 i++, MemA++) 448 if (!(*MemA)->isRead() || 449 ((*MemA)->isArrayKind() && 450 !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) || 451 (*MemA)->isStrideZero(isl_map_copy(PartialSchedule))))) 452 return false; 453 MemA++; 454 if (!(*MemA)->isWrite() || !(*MemA)->isArrayKind() || 455 !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) || 456 (*MemA)->isStrideZero(isl_map_copy(PartialSchedule)))) 457 return false; 458 auto DimNum = isl_map_dim(PartialSchedule, isl_dim_in); 459 return !isInputDimUsed((*MemA)->getAccessRelation(), DimNum - 1); 460 } 461 462 /// @brief Circular shift of output dimensions of the integer map. 463 /// 464 /// @param IslMap The isl map to be modified. 465 static __isl_give isl_map *circularShiftOutputDims(__isl_take isl_map *IslMap) { 466 auto DimNum = isl_map_dim(IslMap, isl_dim_out); 467 if (DimNum == 0) 468 return IslMap; 469 auto InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in); 470 IslMap = isl_map_move_dims(IslMap, isl_dim_in, 0, isl_dim_out, DimNum - 1, 1); 471 IslMap = isl_map_move_dims(IslMap, isl_dim_out, 0, isl_dim_in, 0, 1); 472 return isl_map_set_tuple_id(IslMap, isl_dim_in, InputDimsId); 473 } 474 475 bool ScheduleTreeOptimizer::isMatrMultPattern( 476 __isl_keep isl_schedule_node *Node) { 477 auto *PartialSchedule = 478 isl_schedule_node_band_get_partial_schedule_union_map(Node); 479 if (isl_union_map_n_map(PartialSchedule) != 1) 480 return false; 481 auto *NewPartialSchedule = isl_map_from_union_map(PartialSchedule); 482 auto DimNum = isl_map_dim(NewPartialSchedule, isl_dim_in); 483 if (DimNum != 3) { 484 isl_map_free(NewPartialSchedule); 485 return false; 486 } 487 assert(isl_map_dim(NewPartialSchedule, isl_dim_out) == 3 && 488 "Each schedule dimension should be represented by a union piecewise" 489 "quasi-affine expression."); 490 NewPartialSchedule = circularShiftOutputDims(NewPartialSchedule); 491 if (containsMatrMult(NewPartialSchedule)) { 492 isl_map_free(NewPartialSchedule); 493 return true; 494 } 495 isl_map_free(NewPartialSchedule); 496 return false; 497 } 498 499 __isl_give isl_schedule_node * 500 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node, 501 void *User) { 502 if (!isTileableBandNode(Node)) 503 return Node; 504 505 if (PMBasedOpts && isMatrMultPattern(Node)) 506 DEBUG(dbgs() << "The matrix multiplication pattern was detected\n"); 507 508 return standardBandOpts(Node, User); 509 } 510 511 __isl_give isl_schedule * 512 ScheduleTreeOptimizer::optimizeSchedule(__isl_take isl_schedule *Schedule) { 513 isl_schedule_node *Root = isl_schedule_get_root(Schedule); 514 Root = optimizeScheduleNode(Root); 515 isl_schedule_free(Schedule); 516 auto S = isl_schedule_node_get_schedule(Root); 517 isl_schedule_node_free(Root); 518 return S; 519 } 520 521 __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeScheduleNode( 522 __isl_take isl_schedule_node *Node) { 523 Node = isl_schedule_node_map_descendant_bottom_up(Node, optimizeBand, NULL); 524 return Node; 525 } 526 527 bool ScheduleTreeOptimizer::isProfitableSchedule( 528 Scop &S, __isl_keep isl_union_map *NewSchedule) { 529 // To understand if the schedule has been optimized we check if the schedule 530 // has changed at all. 531 // TODO: We can improve this by tracking if any necessarily beneficial 532 // transformations have been performed. This can e.g. be tiling, loop 533 // interchange, or ...) We can track this either at the place where the 534 // transformation has been performed or, in case of automatic ILP based 535 // optimizations, by comparing (yet to be defined) performance metrics 536 // before/after the scheduling optimizer 537 // (e.g., #stride-one accesses) 538 isl_union_map *OldSchedule = S.getSchedule(); 539 bool changed = !isl_union_map_is_equal(OldSchedule, NewSchedule); 540 isl_union_map_free(OldSchedule); 541 return changed; 542 } 543 544 namespace { 545 class IslScheduleOptimizer : public ScopPass { 546 public: 547 static char ID; 548 explicit IslScheduleOptimizer() : ScopPass(ID) { LastSchedule = nullptr; } 549 550 ~IslScheduleOptimizer() { isl_schedule_free(LastSchedule); } 551 552 /// @brief Optimize the schedule of the SCoP @p S. 553 bool runOnScop(Scop &S) override; 554 555 /// @brief Print the new schedule for the SCoP @p S. 556 void printScop(raw_ostream &OS, Scop &S) const override; 557 558 /// @brief Register all analyses and transformation required. 559 void getAnalysisUsage(AnalysisUsage &AU) const override; 560 561 /// @brief Release the internal memory. 562 void releaseMemory() override { 563 isl_schedule_free(LastSchedule); 564 LastSchedule = nullptr; 565 } 566 567 private: 568 isl_schedule *LastSchedule; 569 }; 570 } 571 572 char IslScheduleOptimizer::ID = 0; 573 574 bool IslScheduleOptimizer::runOnScop(Scop &S) { 575 576 // Skip empty SCoPs but still allow code generation as it will delete the 577 // loops present but not needed. 578 if (S.getSize() == 0) { 579 S.markAsOptimized(); 580 return false; 581 } 582 583 const Dependences &D = 584 getAnalysis<DependenceInfo>().getDependences(Dependences::AL_Statement); 585 586 if (!D.hasValidDependences()) 587 return false; 588 589 isl_schedule_free(LastSchedule); 590 LastSchedule = nullptr; 591 592 // Build input data. 593 int ValidityKinds = 594 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 595 int ProximityKinds; 596 597 if (OptimizeDeps == "all") 598 ProximityKinds = 599 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 600 else if (OptimizeDeps == "raw") 601 ProximityKinds = Dependences::TYPE_RAW; 602 else { 603 errs() << "Do not know how to optimize for '" << OptimizeDeps << "'" 604 << " Falling back to optimizing all dependences.\n"; 605 ProximityKinds = 606 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; 607 } 608 609 isl_union_set *Domain = S.getDomains(); 610 611 if (!Domain) 612 return false; 613 614 isl_union_map *Validity = D.getDependences(ValidityKinds); 615 isl_union_map *Proximity = D.getDependences(ProximityKinds); 616 617 // Simplify the dependences by removing the constraints introduced by the 618 // domains. This can speed up the scheduling time significantly, as large 619 // constant coefficients will be removed from the dependences. The 620 // introduction of some additional dependences reduces the possible 621 // transformations, but in most cases, such transformation do not seem to be 622 // interesting anyway. In some cases this option may stop the scheduler to 623 // find any schedule. 624 if (SimplifyDeps == "yes") { 625 Validity = isl_union_map_gist_domain(Validity, isl_union_set_copy(Domain)); 626 Validity = isl_union_map_gist_range(Validity, isl_union_set_copy(Domain)); 627 Proximity = 628 isl_union_map_gist_domain(Proximity, isl_union_set_copy(Domain)); 629 Proximity = isl_union_map_gist_range(Proximity, isl_union_set_copy(Domain)); 630 } else if (SimplifyDeps != "no") { 631 errs() << "warning: Option -polly-opt-simplify-deps should either be 'yes' " 632 "or 'no'. Falling back to default: 'yes'\n"; 633 } 634 635 DEBUG(dbgs() << "\n\nCompute schedule from: "); 636 DEBUG(dbgs() << "Domain := " << stringFromIslObj(Domain) << ";\n"); 637 DEBUG(dbgs() << "Proximity := " << stringFromIslObj(Proximity) << ";\n"); 638 DEBUG(dbgs() << "Validity := " << stringFromIslObj(Validity) << ";\n"); 639 640 unsigned IslSerializeSCCs; 641 642 if (FusionStrategy == "max") { 643 IslSerializeSCCs = 0; 644 } else if (FusionStrategy == "min") { 645 IslSerializeSCCs = 1; 646 } else { 647 errs() << "warning: Unknown fusion strategy. Falling back to maximal " 648 "fusion.\n"; 649 IslSerializeSCCs = 0; 650 } 651 652 int IslMaximizeBands; 653 654 if (MaximizeBandDepth == "yes") { 655 IslMaximizeBands = 1; 656 } else if (MaximizeBandDepth == "no") { 657 IslMaximizeBands = 0; 658 } else { 659 errs() << "warning: Option -polly-opt-maximize-bands should either be 'yes'" 660 " or 'no'. Falling back to default: 'yes'\n"; 661 IslMaximizeBands = 1; 662 } 663 664 int IslOuterCoincidence; 665 666 if (OuterCoincidence == "yes") { 667 IslOuterCoincidence = 1; 668 } else if (OuterCoincidence == "no") { 669 IslOuterCoincidence = 0; 670 } else { 671 errs() << "warning: Option -polly-opt-outer-coincidence should either be " 672 "'yes' or 'no'. Falling back to default: 'no'\n"; 673 IslOuterCoincidence = 0; 674 } 675 676 isl_options_set_schedule_outer_coincidence(S.getIslCtx(), 677 IslOuterCoincidence); 678 isl_options_set_schedule_serialize_sccs(S.getIslCtx(), IslSerializeSCCs); 679 isl_options_set_schedule_maximize_band_depth(S.getIslCtx(), IslMaximizeBands); 680 isl_options_set_schedule_max_constant_term(S.getIslCtx(), MaxConstantTerm); 681 isl_options_set_schedule_max_coefficient(S.getIslCtx(), MaxCoefficient); 682 isl_options_set_tile_scale_tile_loops(S.getIslCtx(), 0); 683 684 isl_options_set_on_error(S.getIslCtx(), ISL_ON_ERROR_CONTINUE); 685 686 isl_schedule_constraints *ScheduleConstraints; 687 ScheduleConstraints = isl_schedule_constraints_on_domain(Domain); 688 ScheduleConstraints = 689 isl_schedule_constraints_set_proximity(ScheduleConstraints, Proximity); 690 ScheduleConstraints = isl_schedule_constraints_set_validity( 691 ScheduleConstraints, isl_union_map_copy(Validity)); 692 ScheduleConstraints = 693 isl_schedule_constraints_set_coincidence(ScheduleConstraints, Validity); 694 isl_schedule *Schedule; 695 Schedule = isl_schedule_constraints_compute_schedule(ScheduleConstraints); 696 isl_options_set_on_error(S.getIslCtx(), ISL_ON_ERROR_ABORT); 697 698 // In cases the scheduler is not able to optimize the code, we just do not 699 // touch the schedule. 700 if (!Schedule) 701 return false; 702 703 DEBUG({ 704 auto *P = isl_printer_to_str(S.getIslCtx()); 705 P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); 706 P = isl_printer_print_schedule(P, Schedule); 707 dbgs() << "NewScheduleTree: \n" << isl_printer_get_str(P) << "\n"; 708 isl_printer_free(P); 709 }); 710 711 isl_schedule *NewSchedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule); 712 isl_union_map *NewScheduleMap = isl_schedule_get_map(NewSchedule); 713 714 if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewScheduleMap)) { 715 isl_union_map_free(NewScheduleMap); 716 isl_schedule_free(NewSchedule); 717 return false; 718 } 719 720 S.setScheduleTree(NewSchedule); 721 S.markAsOptimized(); 722 723 isl_union_map_free(NewScheduleMap); 724 return false; 725 } 726 727 void IslScheduleOptimizer::printScop(raw_ostream &OS, Scop &) const { 728 isl_printer *p; 729 char *ScheduleStr; 730 731 OS << "Calculated schedule:\n"; 732 733 if (!LastSchedule) { 734 OS << "n/a\n"; 735 return; 736 } 737 738 p = isl_printer_to_str(isl_schedule_get_ctx(LastSchedule)); 739 p = isl_printer_print_schedule(p, LastSchedule); 740 ScheduleStr = isl_printer_get_str(p); 741 isl_printer_free(p); 742 743 OS << ScheduleStr << "\n"; 744 } 745 746 void IslScheduleOptimizer::getAnalysisUsage(AnalysisUsage &AU) const { 747 ScopPass::getAnalysisUsage(AU); 748 AU.addRequired<DependenceInfo>(); 749 } 750 751 Pass *polly::createIslScheduleOptimizerPass() { 752 return new IslScheduleOptimizer(); 753 } 754 755 INITIALIZE_PASS_BEGIN(IslScheduleOptimizer, "polly-opt-isl", 756 "Polly - Optimize schedule of SCoP", false, false); 757 INITIALIZE_PASS_DEPENDENCY(DependenceInfo); 758 INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); 759 INITIALIZE_PASS_END(IslScheduleOptimizer, "polly-opt-isl", 760 "Polly - Optimize schedule of SCoP", false, false) 761