//===- Schedule.cpp - Calculate an optimized schedule ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass generates an entirely new schedule tree from the data dependences
// and iteration domains. The new schedule tree is computed in two steps:
//
// 1) The isl scheduling optimizer is run
//
// The isl scheduling optimizer creates a new schedule tree that maximizes
// parallelism and tileability and minimizes data-dependence distances. The
// algorithm used is a modified version of the ``Pluto'' algorithm:
//
//   U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
//   A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
//   In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
//   Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008.
//
// 2) A set of post-scheduling transformations is applied on the schedule tree.
//
// These optimizations include:
//
//  - Tiling of the innermost tilable bands
//  - Prevectorization - The choice of a possible outer loop that is strip-mined
//                       to the innermost level to enable inner-loop
//                       vectorization.
//  - Some optimizations for spatial locality are also planned.
//
// For a detailed description of the schedule tree itself please see section 6
// of:
//
// Polyhedral AST generation is more than scanning polyhedra
// Tobias Grosser, Sven Verdoolaege, Albert Cohen
// ACM Transactions on Programming Languages and Systems (TOPLAS),
// 37(4), July 2015
// http://www.grosser.es/#pub-polyhedral-AST-generation
//
// This publication also contains a detailed discussion of the different options
// for polyhedral loop unrolling, full/partial tile separation and other uses
// of the schedule tree.
//
//===----------------------------------------------------------------------===//

#include "polly/ScheduleOptimizer.h"
#include "polly/CodeGen/CodeGeneration.h"
#include "polly/DependenceInfo.h"
#include "polly/LinkAllPasses.h"
#include "polly/Options.h"
#include "polly/ScopInfo.h"
#include "polly/Support/GICHelper.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/Debug.h"
#include "isl/aff.h"
#include "isl/band.h"
#include "isl/constraint.h"
#include "isl/map.h"
#include "isl/options.h"
#include "isl/printer.h"
#include "isl/schedule.h"
#include "isl/schedule_node.h"
#include "isl/space.h"
#include "isl/union_map.h"
#include "isl/union_set.h"

using namespace llvm;
using namespace polly;

#define DEBUG_TYPE "polly-opt-isl"

// Which dependence kinds are fed to the isl scheduler as proximity
// constraints (see runOnScop): "all" uses RAW/WAR/WAW, "raw" only RAW.
static cl::opt<std::string>
    OptimizeDeps("polly-opt-optimize-only",
                 cl::desc("Only a certain kind of dependences (all/raw)"),
                 cl::Hidden, cl::init("all"), cl::ZeroOrMore,
                 cl::cat(PollyCategory));

// If "yes", dependences are gisted against the iteration domains before
// scheduling, which removes large constant coefficients.
static cl::opt<std::string>
    SimplifyDeps("polly-opt-simplify-deps",
                 cl::desc("Dependences should be simplified (yes/no)"),
                 cl::Hidden, cl::init("yes"), cl::ZeroOrMore,
                 cl::cat(PollyCategory));

// Passed through to isl_options_set_schedule_max_constant_term.
static cl::opt<int> MaxConstantTerm(
    "polly-opt-max-constant-term",
    cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden,
    cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));

// Passed through to isl_options_set_schedule_max_coefficient.
static cl::opt<int> MaxCoefficient(
    "polly-opt-max-coefficient",
    cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden,
    cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));

// "max" fuses statements where possible; "min" serializes SCCs.
static cl::opt<std::string> FusionStrategy(
    "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"),
    cl::Hidden, cl::init("min"), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<std::string>
    MaximizeBandDepth("polly-opt-maximize-bands",
                      cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
                      cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<std::string> OuterCoincidence(
    "polly-opt-outer-coincidence",
    cl::desc("Try to construct schedules where the outer member of each band "
             "satisfies the coincidence constraints (yes/no)"),
    cl::Hidden, cl::init("no"), cl::ZeroOrMore, cl::cat(PollyCategory));

// Strip-mine factor used by prevectSchedBand to create the SIMD loop.
static cl::opt<int> PrevectorWidth(
    "polly-prevect-width",
    cl::desc(
        "The number of loop iterations to strip-mine for pre-vectorization"),
    cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<bool> FirstLevelTiling("polly-tiling",
                                      cl::desc("Enable loop tiling"),
                                      cl::init(true), cl::ZeroOrMore,
                                      cl::cat(PollyCategory));

// Used by getMicroKernelParams to size the matrix-multiplication
// micro-kernel.
static cl::opt<int> LatencyVectorFma(
    "polly-target-latency-vector-fma",
    cl::desc("The minimal number of cycles between issuing two "
             "dependent consecutive vector fused multiply-add "
             "instructions."),
    cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));

// NOTE(review): "Througput" [sic] — the identifier and the public flag name
// share the typo; renaming either would break existing users of the flag, so
// it is kept as-is.
static cl::opt<int> ThrougputVectorFma(
    "polly-target-througput-vector-fma",
    cl::desc("A throughput of the processor floating-point arithmetic units "
             "expressed in the number of vector fused multiply-add "
             "instructions per clock cycle."),
    cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory));
// Cache parameters consumed by getMacroKernelParams; the matrix-multiplication
// pattern optimization falls back to {1, 1, 1} unless at least two levels are
// described here.
static cl::list<int>
    CacheLevelAssociativity("polly-target-cache-level-associativity",
                            cl::desc("The associativity of each cache level."),
                            cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
                            cl::cat(PollyCategory));

static cl::list<int> CacheLevelSizes(
    "polly-target-cache-level-sizes",
    cl::desc("The size of each cache level specified in bytes."), cl::Hidden,
    cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory));

// First-level tiling: per-dimension sizes, padded with the default size.
static cl::opt<int> FirstLevelDefaultTileSize(
    "polly-default-tile-size",
    cl::desc("The default tile size (if not enough were provided by"
             " --polly-tile-sizes)"),
    cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::list<int> FirstLevelTileSizes(
    "polly-tile-sizes", cl::desc("A tile size for each loop dimension, filled "
                                 "with --polly-default-tile-size"),
    cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory));

// Optional second level of tiling applied on top of the first level.
static cl::opt<bool>
    SecondLevelTiling("polly-2nd-level-tiling",
                      cl::desc("Enable a 2nd level loop of loop tiling"),
                      cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<int> SecondLevelDefaultTileSize(
    "polly-2nd-level-default-tile-size",
    cl::desc("The default 2nd-level tile size (if not enough were provided by"
             " --polly-2nd-level-tile-sizes)"),
    cl::Hidden, cl::init(16), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::list<int>
    SecondLevelTileSizes("polly-2nd-level-tile-sizes",
                         cl::desc("A tile size for each loop dimension, filled "
                                  "with --polly-default-tile-size"),
                         cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
                         cl::cat(PollyCategory));

// Register tiling: the innermost tiling level, unrolled by the AST generator
// (see applyRegisterTiling).
static cl::opt<bool> RegisterTiling("polly-register-tiling",
                                    cl::desc("Enable register tiling"),
                                    cl::init(false), cl::ZeroOrMore,
                                    cl::cat(PollyCategory));

static cl::opt<int> RegisterDefaultTileSize(
    "polly-register-tiling-default-tile-size",
    cl::desc("The default register tile size (if not enough were provided by"
             " --polly-register-tile-sizes)"),
    cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::list<int>
    RegisterTileSizes("polly-register-tile-sizes",
                      cl::desc("A tile size for each loop dimension, filled "
                               "with --polly-register-tile-size"),
                      cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
                      cl::cat(PollyCategory));

// Enables the matrix-multiplication pattern matching in optimizeBand.
static cl::opt<bool>
    PMBasedOpts("polly-pattern-matching-based-opts",
                cl::desc("Perform optimizations based on pattern matching"),
                cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

/// @brief Create an isl_union_set, which describes the isolate option based
///        on IsolateDomain.
///
/// @param IsolateDomain An isl_set whose last dimension is the only one that
///                      should belong to the current band node.
static __isl_give isl_union_set *
getIsolateOptions(__isl_take isl_set *IsolateDomain) {
  auto Dims = isl_set_dim(IsolateDomain, isl_dim_set);
  // Turn the domain into a relation whose single output dimension is the
  // band's own (last) dimension; wrapping that relation and tagging it with
  // the "isolate" id yields the option set understood by the AST generator.
  auto *IsolateRelation = isl_map_from_domain(IsolateDomain);
  IsolateRelation = isl_map_move_dims(IsolateRelation, isl_dim_out, 0,
                                      isl_dim_in, Dims - 1, 1);
  auto *IsolateOption = isl_map_wrap(IsolateRelation);
  auto *Id = isl_id_alloc(isl_set_get_ctx(IsolateOption), "isolate", nullptr);
  return isl_union_set_from_set(isl_set_set_tuple_id(IsolateOption, Id));
}

/// @brief Create an isl_union_set, which describes the atomic option for the
///        dimension of the current node.
///
/// It may help to reduce the size of generated code.
///
/// @param Ctx An isl_ctx, which is used to create the isl_union_set.
222 static __isl_give isl_union_set *getAtomicOptions(__isl_take isl_ctx *Ctx) { 223 auto *Space = isl_space_set_alloc(Ctx, 0, 1); 224 auto *AtomicOption = isl_set_universe(Space); 225 auto *Id = isl_id_alloc(Ctx, "atomic", nullptr); 226 return isl_union_set_from_set(isl_set_set_tuple_id(AtomicOption, Id)); 227 } 228 229 /// @brief Make the last dimension of Set to take values 230 /// from 0 to VectorWidth - 1. 231 /// 232 /// @param Set A set, which should be modified. 233 /// @param VectorWidth A parameter, which determines the constraint. 234 static __isl_give isl_set *addExtentConstraints(__isl_take isl_set *Set, 235 int VectorWidth) { 236 auto Dims = isl_set_dim(Set, isl_dim_set); 237 auto Space = isl_set_get_space(Set); 238 auto *LocalSpace = isl_local_space_from_space(Space); 239 auto *ExtConstr = 240 isl_constraint_alloc_inequality(isl_local_space_copy(LocalSpace)); 241 ExtConstr = isl_constraint_set_constant_si(ExtConstr, 0); 242 ExtConstr = 243 isl_constraint_set_coefficient_si(ExtConstr, isl_dim_set, Dims - 1, 1); 244 Set = isl_set_add_constraint(Set, ExtConstr); 245 ExtConstr = isl_constraint_alloc_inequality(LocalSpace); 246 ExtConstr = isl_constraint_set_constant_si(ExtConstr, VectorWidth - 1); 247 ExtConstr = 248 isl_constraint_set_coefficient_si(ExtConstr, isl_dim_set, Dims - 1, -1); 249 return isl_set_add_constraint(Set, ExtConstr); 250 } 251 252 /// @brief Build the desired set of partial tile prefixes. 253 /// 254 /// We build a set of partial tile prefixes, which are prefixes of the vector 255 /// loop that have exactly VectorWidth iterations. 256 /// 257 /// 1. Get all prefixes of the vector loop. 258 /// 2. Extend it to a set, which has exactly VectorWidth iterations for 259 /// any prefix from the set that was built on the previous step. 260 /// 3. Subtract loop domain from it, project out the vector loop dimension and 261 /// get a set of prefixes, which don't have exactly VectorWidth iterations. 262 /// 4. 
Subtract it from all prefixes of the vector loop and get the desired 263 /// set. 264 /// 265 /// @param ScheduleRange A range of a map, which describes a prefix schedule 266 /// relation. 267 static __isl_give isl_set * 268 getPartialTilePrefixes(__isl_take isl_set *ScheduleRange, int VectorWidth) { 269 auto Dims = isl_set_dim(ScheduleRange, isl_dim_set); 270 auto *LoopPrefixes = isl_set_project_out(isl_set_copy(ScheduleRange), 271 isl_dim_set, Dims - 1, 1); 272 auto *ExtentPrefixes = 273 isl_set_add_dims(isl_set_copy(LoopPrefixes), isl_dim_set, 1); 274 ExtentPrefixes = addExtentConstraints(ExtentPrefixes, VectorWidth); 275 auto *BadPrefixes = isl_set_subtract(ExtentPrefixes, ScheduleRange); 276 BadPrefixes = isl_set_project_out(BadPrefixes, isl_dim_set, Dims - 1, 1); 277 return isl_set_subtract(LoopPrefixes, BadPrefixes); 278 } 279 280 __isl_give isl_schedule_node *ScheduleTreeOptimizer::isolateFullPartialTiles( 281 __isl_take isl_schedule_node *Node, int VectorWidth) { 282 assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band); 283 Node = isl_schedule_node_child(Node, 0); 284 Node = isl_schedule_node_child(Node, 0); 285 auto *SchedRelUMap = isl_schedule_node_get_prefix_schedule_relation(Node); 286 auto *ScheduleRelation = isl_map_from_union_map(SchedRelUMap); 287 auto *ScheduleRange = isl_map_range(ScheduleRelation); 288 auto *IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth); 289 auto *AtomicOption = getAtomicOptions(isl_set_get_ctx(IsolateDomain)); 290 auto *IsolateOption = getIsolateOptions(IsolateDomain); 291 Node = isl_schedule_node_parent(Node); 292 Node = isl_schedule_node_parent(Node); 293 auto *Options = isl_union_set_union(IsolateOption, AtomicOption); 294 Node = isl_schedule_node_band_set_ast_build_options(Node, Options); 295 return Node; 296 } 297 298 __isl_give isl_schedule_node * 299 ScheduleTreeOptimizer::prevectSchedBand(__isl_take isl_schedule_node *Node, 300 unsigned DimToVectorize, 301 int VectorWidth) { 302 
assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band); 303 304 auto Space = isl_schedule_node_band_get_space(Node); 305 auto ScheduleDimensions = isl_space_dim(Space, isl_dim_set); 306 isl_space_free(Space); 307 assert(DimToVectorize < ScheduleDimensions); 308 309 if (DimToVectorize > 0) { 310 Node = isl_schedule_node_band_split(Node, DimToVectorize); 311 Node = isl_schedule_node_child(Node, 0); 312 } 313 if (DimToVectorize < ScheduleDimensions - 1) 314 Node = isl_schedule_node_band_split(Node, 1); 315 Space = isl_schedule_node_band_get_space(Node); 316 auto Sizes = isl_multi_val_zero(Space); 317 auto Ctx = isl_schedule_node_get_ctx(Node); 318 Sizes = 319 isl_multi_val_set_val(Sizes, 0, isl_val_int_from_si(Ctx, VectorWidth)); 320 Node = isl_schedule_node_band_tile(Node, Sizes); 321 Node = isolateFullPartialTiles(Node, VectorWidth); 322 Node = isl_schedule_node_child(Node, 0); 323 // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise, 324 // we will have troubles to match it in the backend. 
325 Node = isl_schedule_node_band_set_ast_build_options( 326 Node, isl_union_set_read_from_str(Ctx, "{ unroll[x]: 1 = 0 }")); 327 Node = isl_schedule_node_band_sink(Node); 328 Node = isl_schedule_node_child(Node, 0); 329 if (isl_schedule_node_get_type(Node) == isl_schedule_node_leaf) 330 Node = isl_schedule_node_parent(Node); 331 isl_id *LoopMarker = isl_id_alloc(Ctx, "SIMD", nullptr); 332 Node = isl_schedule_node_insert_mark(Node, LoopMarker); 333 return Node; 334 } 335 336 __isl_give isl_schedule_node * 337 ScheduleTreeOptimizer::tileNode(__isl_take isl_schedule_node *Node, 338 const char *Identifier, ArrayRef<int> TileSizes, 339 int DefaultTileSize) { 340 auto Ctx = isl_schedule_node_get_ctx(Node); 341 auto Space = isl_schedule_node_band_get_space(Node); 342 auto Dims = isl_space_dim(Space, isl_dim_set); 343 auto Sizes = isl_multi_val_zero(Space); 344 std::string IdentifierString(Identifier); 345 for (unsigned i = 0; i < Dims; i++) { 346 auto tileSize = i < TileSizes.size() ? TileSizes[i] : DefaultTileSize; 347 Sizes = isl_multi_val_set_val(Sizes, i, isl_val_int_from_si(Ctx, tileSize)); 348 } 349 auto TileLoopMarkerStr = IdentifierString + " - Tiles"; 350 isl_id *TileLoopMarker = 351 isl_id_alloc(Ctx, TileLoopMarkerStr.c_str(), nullptr); 352 Node = isl_schedule_node_insert_mark(Node, TileLoopMarker); 353 Node = isl_schedule_node_child(Node, 0); 354 Node = isl_schedule_node_band_tile(Node, Sizes); 355 Node = isl_schedule_node_child(Node, 0); 356 auto PointLoopMarkerStr = IdentifierString + " - Points"; 357 isl_id *PointLoopMarker = 358 isl_id_alloc(Ctx, PointLoopMarkerStr.c_str(), nullptr); 359 Node = isl_schedule_node_insert_mark(Node, PointLoopMarker); 360 Node = isl_schedule_node_child(Node, 0); 361 return Node; 362 } 363 364 __isl_give isl_schedule_node * 365 ScheduleTreeOptimizer::applyRegisterTiling(__isl_take isl_schedule_node *Node, 366 llvm::ArrayRef<int> TileSizes, 367 int DefaultTileSize) { 368 auto *Ctx = isl_schedule_node_get_ctx(Node); 369 Node = 
tileNode(Node, "Register tiling", TileSizes, DefaultTileSize); 370 Node = isl_schedule_node_band_set_ast_build_options( 371 Node, isl_union_set_read_from_str(Ctx, "{unroll[x]}")); 372 return Node; 373 } 374 375 bool ScheduleTreeOptimizer::isTileableBandNode( 376 __isl_keep isl_schedule_node *Node) { 377 if (isl_schedule_node_get_type(Node) != isl_schedule_node_band) 378 return false; 379 380 if (isl_schedule_node_n_children(Node) != 1) 381 return false; 382 383 if (!isl_schedule_node_band_get_permutable(Node)) 384 return false; 385 386 auto Space = isl_schedule_node_band_get_space(Node); 387 auto Dims = isl_space_dim(Space, isl_dim_set); 388 isl_space_free(Space); 389 390 if (Dims <= 1) 391 return false; 392 393 auto Child = isl_schedule_node_get_child(Node, 0); 394 auto Type = isl_schedule_node_get_type(Child); 395 isl_schedule_node_free(Child); 396 397 if (Type != isl_schedule_node_leaf) 398 return false; 399 400 return true; 401 } 402 403 __isl_give isl_schedule_node * 404 ScheduleTreeOptimizer::standardBandOpts(__isl_take isl_schedule_node *Node, 405 void *User) { 406 if (FirstLevelTiling) 407 Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes, 408 FirstLevelDefaultTileSize); 409 410 if (SecondLevelTiling) 411 Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes, 412 SecondLevelDefaultTileSize); 413 414 if (RegisterTiling) 415 Node = 416 applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize); 417 418 if (PollyVectorizerChoice == VECTORIZER_NONE) 419 return Node; 420 421 auto Space = isl_schedule_node_band_get_space(Node); 422 auto Dims = isl_space_dim(Space, isl_dim_set); 423 isl_space_free(Space); 424 425 for (int i = Dims - 1; i >= 0; i--) 426 if (isl_schedule_node_band_member_get_coincident(Node, i)) { 427 Node = prevectSchedBand(Node, i, PrevectorWidth); 428 break; 429 } 430 431 return Node; 432 } 433 434 /// @brief Check whether output dimensions of the map rely on the specified 435 /// input dimension. 
///
/// @param IslMap The isl map to be considered.
/// @param DimNum The number of an input dimension to be checked.
static bool isInputDimUsed(__isl_take isl_map *IslMap, unsigned DimNum) {
  // Project the input dimension out and re-insert an unconstrained dimension
  // in its place: if the map is unchanged by this, no output dimension
  // depended on input dimension DimNum.
  auto *CheckedAccessRelation =
      isl_map_project_out(isl_map_copy(IslMap), isl_dim_in, DimNum, 1);
  CheckedAccessRelation =
      isl_map_insert_dims(CheckedAccessRelation, isl_dim_in, DimNum, 1);
  // Restore both tuple ids, which the project/insert round-trip discards,
  // so that the equality comparison is not spoiled by differing ids.
  auto *InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in);
  CheckedAccessRelation =
      isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_in, InputDimsId);
  InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_out);
  CheckedAccessRelation =
      isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_out, InputDimsId);
  // NOTE(review): isl_map_is_equal returns isl_bool (-1 on error); the
  // negation folds an error into "not used" (false) — confirm this is the
  // intended fallback.
  auto res = !isl_map_is_equal(CheckedAccessRelation, IslMap);
  isl_map_free(CheckedAccessRelation);
  isl_map_free(IslMap);
  return res;
}

/// @brief Check if the SCoP statement could probably be optimized with
///        analytical modeling.
///
/// containsMatrMult tries to determine whether the following conditions
/// are true:
/// 1. all memory accesses of the statement will have stride 0 or 1,
///    if we interchange loops (switch the variable used in the inner
///    loop to the outer loop).
/// 2. all memory accesses of the statement except from the last one, are
///    read memory access and the last one is write memory access.
/// 3. all subscripts of the last memory access of the statement don't contain
///    the variable used in the inner loop.
///
/// @param PartialSchedule The PartialSchedule that contains a SCoP statement
///                        to check.
471 static bool containsMatrMult(__isl_keep isl_map *PartialSchedule) { 472 auto InputDimsId = isl_map_get_tuple_id(PartialSchedule, isl_dim_in); 473 auto *ScpStmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId)); 474 isl_id_free(InputDimsId); 475 if (ScpStmt->size() <= 1) 476 return false; 477 auto MemA = ScpStmt->begin(); 478 for (unsigned i = 0; i < ScpStmt->size() - 2 && MemA != ScpStmt->end(); 479 i++, MemA++) 480 if (!(*MemA)->isRead() || 481 ((*MemA)->isArrayKind() && 482 !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) || 483 (*MemA)->isStrideZero(isl_map_copy(PartialSchedule))))) 484 return false; 485 MemA++; 486 if (!(*MemA)->isWrite() || !(*MemA)->isArrayKind() || 487 !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) || 488 (*MemA)->isStrideZero(isl_map_copy(PartialSchedule)))) 489 return false; 490 auto DimNum = isl_map_dim(PartialSchedule, isl_dim_in); 491 return !isInputDimUsed((*MemA)->getAccessRelation(), DimNum - 1); 492 } 493 494 /// @brief Circular shift of output dimensions of the integer map. 495 /// 496 /// @param IslMap The isl map to be modified. 497 static __isl_give isl_map *circularShiftOutputDims(__isl_take isl_map *IslMap) { 498 auto DimNum = isl_map_dim(IslMap, isl_dim_out); 499 if (DimNum == 0) 500 return IslMap; 501 auto InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in); 502 IslMap = isl_map_move_dims(IslMap, isl_dim_in, 0, isl_dim_out, DimNum - 1, 1); 503 IslMap = isl_map_move_dims(IslMap, isl_dim_out, 0, isl_dim_in, 0, 1); 504 return isl_map_set_tuple_id(IslMap, isl_dim_in, InputDimsId); 505 } 506 507 /// @brief Permute two dimensions of the band node. 508 /// 509 /// Permute FirstDim and SecondDim dimensions of the Node. 510 /// 511 /// @param Node The band node to be modified. 512 /// @param FirstDim The first dimension to be permuted. 513 /// @param SecondDim The second dimension to be permuted. 
static __isl_give isl_schedule_node *
permuteBandNodeDimensions(__isl_take isl_schedule_node *Node, unsigned FirstDim,
                          unsigned SecondDim) {
  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band &&
         isl_schedule_node_band_n_member(Node) > std::max(FirstDim, SecondDim));
  // Swap the two members of the band's partial schedule.
  auto PartialSchedule = isl_schedule_node_band_get_partial_schedule(Node);
  auto PartialScheduleFirstDim =
      isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, FirstDim);
  auto PartialScheduleSecondDim =
      isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, SecondDim);
  PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
      PartialSchedule, SecondDim, PartialScheduleFirstDim);
  PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
      PartialSchedule, FirstDim, PartialScheduleSecondDim);
  // Replace the old band: delete the current node and re-insert the permuted
  // partial schedule in its place.
  Node = isl_schedule_node_delete(Node);
  Node = isl_schedule_node_insert_partial_schedule(Node, PartialSchedule);
  return Node;
}

/// Apply register tiling with the micro-kernel sizes {Mr, Nr}; remaining
/// dimensions fall back to a tile size of 1.
__isl_give isl_schedule_node *ScheduleTreeOptimizer::createMicroKernel(
    __isl_take isl_schedule_node *Node, MicroKernelParamsTy MicroKernelParams) {
  return applyRegisterTiling(Node, {MicroKernelParams.Mr, MicroKernelParams.Nr},
                             1);
}

__isl_give isl_schedule_node *ScheduleTreeOptimizer::createMacroKernel(
    __isl_take isl_schedule_node *Node, MacroKernelParamsTy MacroKernelParams) {
  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
  // {1, 1, 1} is the fallback produced when no usable cache parameters were
  // provided — tiling by 1 in every dimension would be pointless.
  if (MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 1 &&
      MacroKernelParams.Kc == 1)
    return Node;
  Node = tileNode(
      Node, "1st level tiling",
      {MacroKernelParams.Mc, MacroKernelParams.Nc, MacroKernelParams.Kc}, 1);
  // tileNode returns the point-loop band below the "Points" mark; climb back
  // up two levels before permuting the band, then descend again.
  Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
  Node = permuteBandNodeDimensions(Node, 1, 2);
  return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
}

/// Get parameters of the BLIS micro kernel.
554 /// 555 /// We choose the Mr and Nr parameters of the micro kernel to be large enough 556 /// such that no stalls caused by the combination of latencies and dependencies 557 /// are introduced during the updates of the resulting matrix of the matrix 558 /// multiplication. However, they should also be as small as possible to 559 /// release more registers for entries of multiplied matrices. 560 /// 561 /// @param TTI Target Transform Info. 562 /// @return The structure of type MicroKernelParamsTy. 563 /// @see MicroKernelParamsTy 564 static struct MicroKernelParamsTy 565 getMicroKernelParams(const llvm::TargetTransformInfo *TTI) { 566 assert(TTI && "The target transform info should be provided."); 567 568 // Nvec - Number of double-precision floating-point numbers that can be hold 569 // by a vector register. Use 2 by default. 570 auto Nvec = TTI->getRegisterBitWidth(true) / 64; 571 if (Nvec == 0) 572 Nvec = 2; 573 int Nr = 574 ceil(sqrt(Nvec * LatencyVectorFma * ThrougputVectorFma) / Nvec) * Nvec; 575 int Mr = ceil(Nvec * LatencyVectorFma * ThrougputVectorFma / Nr); 576 return {Mr, Nr}; 577 } 578 579 /// Get parameters of the BLIS macro kernel. 580 /// 581 /// During the computation of matrix multiplication, blocks of partitioned 582 /// matrices are mapped to different layers of the memory hierarchy. 583 /// To optimize data reuse, blocks should be ideally kept in cache between 584 /// iterations. Since parameters of the macro kernel determine sizes of these 585 /// blocks, there are upper and lower bounds on these parameters. 586 /// 587 /// @param MicroKernelParams Parameters of the micro-kernel 588 /// to be taken into account. 589 /// @return The structure of type MacroKernelParamsTy. 
590 /// @see MacroKernelParamsTy 591 /// @see MicroKernelParamsTy 592 static struct MacroKernelParamsTy 593 getMacroKernelParams(const MicroKernelParamsTy &MicroKernelParams) { 594 // According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf, 595 // it requires information about the first two levels of a cache to determine 596 // all the parameters of a macro-kernel. It also checks that an associativity 597 // degree of a cache level is greater than two. Otherwise, another algorithm 598 // for determination of the parameters should be used. 599 if (!(MicroKernelParams.Mr > 0 && MicroKernelParams.Nr > 0 && 600 CacheLevelSizes.size() >= 2 && CacheLevelAssociativity.size() >= 2 && 601 CacheLevelSizes[0] > 0 && CacheLevelSizes[1] > 0 && 602 CacheLevelAssociativity[0] > 2 && CacheLevelAssociativity[1] > 2)) 603 return {1, 1, 1}; 604 int Cbr = floor( 605 (CacheLevelAssociativity[0] - 1) / 606 (1 + static_cast<double>(MicroKernelParams.Mr) / MicroKernelParams.Nr)); 607 int Kc = (Cbr * CacheLevelSizes[0]) / 608 (MicroKernelParams.Nr * CacheLevelAssociativity[0] * 8); 609 double Cac = static_cast<double>(MicroKernelParams.Mr * Kc * 8 * 610 CacheLevelAssociativity[1]) / 611 CacheLevelSizes[1]; 612 double Cbc = static_cast<double>(MicroKernelParams.Nr * Kc * 8 * 613 CacheLevelAssociativity[1]) / 614 CacheLevelSizes[1]; 615 int Mc = floor(MicroKernelParams.Mr / Cac); 616 int Nc = 617 floor((MicroKernelParams.Nr * (CacheLevelAssociativity[1] - 2)) / Cbc); 618 return {Mc, Nc, Kc}; 619 } 620 621 __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern( 622 __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) { 623 assert(TTI && "The target transform info should be provided."); 624 auto MicroKernelParams = getMicroKernelParams(TTI); 625 auto MacroKernelParams = getMacroKernelParams(MicroKernelParams); 626 Node = createMacroKernel(Node, MacroKernelParams); 627 Node = createMicroKernel(Node, MicroKernelParams); 628 return Node; 
629 } 630 631 bool ScheduleTreeOptimizer::isMatrMultPattern( 632 __isl_keep isl_schedule_node *Node) { 633 auto *PartialSchedule = 634 isl_schedule_node_band_get_partial_schedule_union_map(Node); 635 if (isl_schedule_node_band_n_member(Node) != 3 || 636 isl_union_map_n_map(PartialSchedule) != 1) { 637 isl_union_map_free(PartialSchedule); 638 return false; 639 } 640 auto *NewPartialSchedule = isl_map_from_union_map(PartialSchedule); 641 NewPartialSchedule = circularShiftOutputDims(NewPartialSchedule); 642 if (containsMatrMult(NewPartialSchedule)) { 643 isl_map_free(NewPartialSchedule); 644 return true; 645 } 646 isl_map_free(NewPartialSchedule); 647 return false; 648 } 649 650 __isl_give isl_schedule_node * 651 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node, 652 void *User) { 653 if (!isTileableBandNode(Node)) 654 return Node; 655 656 if (PMBasedOpts && User && isMatrMultPattern(Node)) { 657 DEBUG(dbgs() << "The matrix multiplication pattern was detected\n"); 658 const llvm::TargetTransformInfo *TTI; 659 TTI = static_cast<const llvm::TargetTransformInfo *>(User); 660 Node = optimizeMatMulPattern(Node, TTI); 661 } 662 663 return standardBandOpts(Node, User); 664 } 665 666 __isl_give isl_schedule * 667 ScheduleTreeOptimizer::optimizeSchedule(__isl_take isl_schedule *Schedule, 668 const llvm::TargetTransformInfo *TTI) { 669 isl_schedule_node *Root = isl_schedule_get_root(Schedule); 670 Root = optimizeScheduleNode(Root, TTI); 671 isl_schedule_free(Schedule); 672 auto S = isl_schedule_node_get_schedule(Root); 673 isl_schedule_node_free(Root); 674 return S; 675 } 676 677 __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeScheduleNode( 678 __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) { 679 Node = isl_schedule_node_map_descendant_bottom_up( 680 Node, optimizeBand, const_cast<void *>(static_cast<const void *>(TTI))); 681 return Node; 682 } 683 684 bool ScheduleTreeOptimizer::isProfitableSchedule( 685 Scop &S, 
__isl_keep isl_union_map *NewSchedule) { 686 // To understand if the schedule has been optimized we check if the schedule 687 // has changed at all. 688 // TODO: We can improve this by tracking if any necessarily beneficial 689 // transformations have been performed. This can e.g. be tiling, loop 690 // interchange, or ...) We can track this either at the place where the 691 // transformation has been performed or, in case of automatic ILP based 692 // optimizations, by comparing (yet to be defined) performance metrics 693 // before/after the scheduling optimizer 694 // (e.g., #stride-one accesses) 695 isl_union_map *OldSchedule = S.getSchedule(); 696 bool changed = !isl_union_map_is_equal(OldSchedule, NewSchedule); 697 isl_union_map_free(OldSchedule); 698 return changed; 699 } 700 701 namespace { 702 class IslScheduleOptimizer : public ScopPass { 703 public: 704 static char ID; 705 explicit IslScheduleOptimizer() : ScopPass(ID) { LastSchedule = nullptr; } 706 707 ~IslScheduleOptimizer() { isl_schedule_free(LastSchedule); } 708 709 /// @brief Optimize the schedule of the SCoP @p S. 710 bool runOnScop(Scop &S) override; 711 712 /// @brief Print the new schedule for the SCoP @p S. 713 void printScop(raw_ostream &OS, Scop &S) const override; 714 715 /// @brief Register all analyses and transformation required. 716 void getAnalysisUsage(AnalysisUsage &AU) const override; 717 718 /// @brief Release the internal memory. 719 void releaseMemory() override { 720 isl_schedule_free(LastSchedule); 721 LastSchedule = nullptr; 722 } 723 724 private: 725 isl_schedule *LastSchedule; 726 }; 727 } // namespace 728 729 char IslScheduleOptimizer::ID = 0; 730 731 bool IslScheduleOptimizer::runOnScop(Scop &S) { 732 733 // Skip empty SCoPs but still allow code generation as it will delete the 734 // loops present but not needed. 
  if (S.getSize() == 0) {
    S.markAsOptimized();
    return false;
  }

  const Dependences &D =
      getAnalysis<DependenceInfo>().getDependences(Dependences::AL_Statement);

  if (!D.hasValidDependences())
    return false;

  // Drop the schedule kept from a previously optimized SCoP before computing
  // a new one.
  isl_schedule_free(LastSchedule);
  LastSchedule = nullptr;

  // Build input data.
  // Validity dependences must be respected by every legal schedule; proximity
  // dependences are the ones whose distance the scheduler tries to minimize.
  int ValidityKinds =
      Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
  int ProximityKinds;

  if (OptimizeDeps == "all")
    ProximityKinds =
        Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
  else if (OptimizeDeps == "raw")
    ProximityKinds = Dependences::TYPE_RAW;
  else {
    // Unknown -polly-opt-optimize-only value: warn and behave like "all".
    errs() << "Do not know how to optimize for '" << OptimizeDeps << "'"
           << " Falling back to optimizing all dependences.\n";
    ProximityKinds =
        Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
  }

  isl_union_set *Domain = S.getDomains();

  if (!Domain)
    return false;

  isl_union_map *Validity = D.getDependences(ValidityKinds);
  isl_union_map *Proximity = D.getDependences(ProximityKinds);

  // Simplify the dependences by removing the constraints introduced by the
  // domains. This can speed up the scheduling time significantly, as large
  // constant coefficients will be removed from the dependences. The
  // introduction of some additional dependences reduces the possible
  // transformations, but in most cases, such transformation do not seem to be
  // interesting anyway. In some cases this option may stop the scheduler to
  // find any schedule.
  if (SimplifyDeps == "yes") {
    Validity = isl_union_map_gist_domain(Validity, isl_union_set_copy(Domain));
    Validity = isl_union_map_gist_range(Validity, isl_union_set_copy(Domain));
    Proximity =
        isl_union_map_gist_domain(Proximity, isl_union_set_copy(Domain));
    Proximity = isl_union_map_gist_range(Proximity, isl_union_set_copy(Domain));
  } else if (SimplifyDeps != "no") {
    errs() << "warning: Option -polly-opt-simplify-deps should either be 'yes' "
              "or 'no'. Falling back to default: 'yes'\n";
  }

  DEBUG(dbgs() << "\n\nCompute schedule from: ");
  DEBUG(dbgs() << "Domain := " << stringFromIslObj(Domain) << ";\n");
  DEBUG(dbgs() << "Proximity := " << stringFromIslObj(Proximity) << ";\n");
  DEBUG(dbgs() << "Validity := " << stringFromIslObj(Validity) << ";\n");

  // Map the fusion strategy onto isl's serialize-sccs option: serializing
  // SCCs (1) corresponds to minimal fusion, not serializing (0) to maximal
  // fusion.
  unsigned IslSerializeSCCs;

  if (FusionStrategy == "max") {
    IslSerializeSCCs = 0;
  } else if (FusionStrategy == "min") {
    IslSerializeSCCs = 1;
  } else {
    errs() << "warning: Unknown fusion strategy. Falling back to maximal "
              "fusion.\n";
    IslSerializeSCCs = 0;
  }

  int IslMaximizeBands;

  if (MaximizeBandDepth == "yes") {
    IslMaximizeBands = 1;
  } else if (MaximizeBandDepth == "no") {
    IslMaximizeBands = 0;
  } else {
    errs() << "warning: Option -polly-opt-maximize-bands should either be 'yes'"
              " or 'no'. Falling back to default: 'yes'\n";
    IslMaximizeBands = 1;
  }

  int IslOuterCoincidence;

  if (OuterCoincidence == "yes") {
    IslOuterCoincidence = 1;
  } else if (OuterCoincidence == "no") {
    IslOuterCoincidence = 0;
  } else {
    errs() << "warning: Option -polly-opt-outer-coincidence should either be "
              "'yes' or 'no'. Falling back to default: 'no'\n";
    IslOuterCoincidence = 0;
  }

  isl_ctx *Ctx = S.getIslCtx();

  // Configure the scheduling options on the SCoP's shared isl context.
  isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence);
  isl_options_set_schedule_serialize_sccs(Ctx, IslSerializeSCCs);
  isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands);
  isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm);
  isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient);
  isl_options_set_tile_scale_tile_loops(Ctx, 0);

  // Temporarily continue on isl errors so that a failed scheduling run yields
  // a null schedule (checked below) instead of aborting; the previous
  // on-error behavior is restored right after the computation.
  auto OnErrorStatus = isl_options_get_on_error(Ctx);
  isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE);

  // Hand Domain, Proximity and Validity over to the schedule constraints
  // (the setters consume their arguments). Validity doubles as the
  // coincidence constraints, hence the extra isl_union_map_copy.
  isl_schedule_constraints *ScheduleConstraints;
  ScheduleConstraints = isl_schedule_constraints_on_domain(Domain);
  ScheduleConstraints =
      isl_schedule_constraints_set_proximity(ScheduleConstraints, Proximity);
  ScheduleConstraints = isl_schedule_constraints_set_validity(
      ScheduleConstraints, isl_union_map_copy(Validity));
  ScheduleConstraints =
      isl_schedule_constraints_set_coincidence(ScheduleConstraints, Validity);
  isl_schedule *Schedule;
  Schedule = isl_schedule_constraints_compute_schedule(ScheduleConstraints);
  isl_options_set_on_error(Ctx, OnErrorStatus);

  // In cases the scheduler is not able to optimize the code, we just do not
  // touch the schedule.
859 if (!Schedule) 860 return false; 861 862 DEBUG({ 863 auto *P = isl_printer_to_str(Ctx); 864 P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); 865 P = isl_printer_print_schedule(P, Schedule); 866 dbgs() << "NewScheduleTree: \n" << isl_printer_get_str(P) << "\n"; 867 isl_printer_free(P); 868 }); 869 870 Function &F = S.getFunction(); 871 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 872 isl_schedule *NewSchedule = 873 ScheduleTreeOptimizer::optimizeSchedule(Schedule, TTI); 874 isl_union_map *NewScheduleMap = isl_schedule_get_map(NewSchedule); 875 876 if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewScheduleMap)) { 877 isl_union_map_free(NewScheduleMap); 878 isl_schedule_free(NewSchedule); 879 return false; 880 } 881 882 S.setScheduleTree(NewSchedule); 883 S.markAsOptimized(); 884 885 isl_union_map_free(NewScheduleMap); 886 return false; 887 } 888 889 void IslScheduleOptimizer::printScop(raw_ostream &OS, Scop &) const { 890 isl_printer *p; 891 char *ScheduleStr; 892 893 OS << "Calculated schedule:\n"; 894 895 if (!LastSchedule) { 896 OS << "n/a\n"; 897 return; 898 } 899 900 p = isl_printer_to_str(isl_schedule_get_ctx(LastSchedule)); 901 p = isl_printer_print_schedule(p, LastSchedule); 902 ScheduleStr = isl_printer_get_str(p); 903 isl_printer_free(p); 904 905 OS << ScheduleStr << "\n"; 906 } 907 908 void IslScheduleOptimizer::getAnalysisUsage(AnalysisUsage &AU) const { 909 ScopPass::getAnalysisUsage(AU); 910 AU.addRequired<DependenceInfo>(); 911 AU.addRequired<TargetTransformInfoWrapperPass>(); 912 } 913 914 Pass *polly::createIslScheduleOptimizerPass() { 915 return new IslScheduleOptimizer(); 916 } 917 918 INITIALIZE_PASS_BEGIN(IslScheduleOptimizer, "polly-opt-isl", 919 "Polly - Optimize schedule of SCoP", false, false); 920 INITIALIZE_PASS_DEPENDENCY(DependenceInfo); 921 INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); 922 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass); 923 INITIALIZE_PASS_END(IslScheduleOptimizer, 
"polly-opt-isl", 924 "Polly - Optimize schedule of SCoP", false, false) 925