1 //===- Schedule.cpp - Calculate an optimized schedule ---------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass generates an entirely new schedule tree from the data dependences
11 // and iteration domains. The new schedule tree is computed in two steps:
12 //
13 // 1) The isl scheduling optimizer is run
14 //
15 // The isl scheduling optimizer creates a new schedule tree that maximizes
16 // parallelism and tileability and minimizes data-dependence distances. The
17 // algorithm used is a modified version of the ``Pluto'' algorithm:
18 //
19 //   U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
20 //   A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
21 //   In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
22 //   Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008.
23 //
24 // 2) A set of post-scheduling transformations is applied on the schedule tree.
25 //
26 // These optimizations include:
27 //
28 //  - Tiling of the innermost tilable bands
//  - Prevectorization - The choice of a possible outer loop that is strip-mined
30 //                       to the innermost level to enable inner-loop
31 //                       vectorization.
32 //  - Some optimizations for spatial locality are also planned.
33 //
34 // For a detailed description of the schedule tree itself please see section 6
35 // of:
36 //
37 // Polyhedral AST generation is more than scanning polyhedra
38 // Tobias Grosser, Sven Verdoolaege, Albert Cohen
// ACM Transactions on Programming Languages and Systems (TOPLAS),
40 // 37(4), July 2015
41 // http://www.grosser.es/#pub-polyhedral-AST-generation
42 //
43 // This publication also contains a detailed discussion of the different options
44 // for polyhedral loop unrolling, full/partial tile separation and other uses
45 // of the schedule tree.
46 //
47 //===----------------------------------------------------------------------===//
48 
49 #include "polly/ScheduleOptimizer.h"
50 #include "polly/CodeGen/CodeGeneration.h"
51 #include "polly/DependenceInfo.h"
52 #include "polly/LinkAllPasses.h"
53 #include "polly/Options.h"
54 #include "polly/ScopInfo.h"
55 #include "polly/Support/GICHelper.h"
56 #include "llvm/Analysis/TargetTransformInfo.h"
57 #include "llvm/Support/Debug.h"
58 #include "isl/aff.h"
59 #include "isl/band.h"
60 #include "isl/constraint.h"
61 #include "isl/map.h"
62 #include "isl/options.h"
63 #include "isl/printer.h"
64 #include "isl/schedule.h"
65 #include "isl/schedule_node.h"
66 #include "isl/space.h"
67 #include "isl/union_map.h"
68 #include "isl/union_set.h"
69 
70 using namespace llvm;
71 using namespace polly;
72 
73 #define DEBUG_TYPE "polly-opt-isl"
74 
// Command-line options configuring the isl scheduling optimizer and the
// post-scheduling transformations. All options are part of the Polly
// category.

// Options forwarded to the isl scheduler.

static cl::opt<std::string>
    OptimizeDeps("polly-opt-optimize-only",
                 cl::desc("Only a certain kind of dependences (all/raw)"),
                 cl::Hidden, cl::init("all"), cl::ZeroOrMore,
                 cl::cat(PollyCategory));

static cl::opt<std::string>
    SimplifyDeps("polly-opt-simplify-deps",
                 cl::desc("Dependences should be simplified (yes/no)"),
                 cl::Hidden, cl::init("yes"), cl::ZeroOrMore,
                 cl::cat(PollyCategory));

static cl::opt<int> MaxConstantTerm(
    "polly-opt-max-constant-term",
    cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden,
    cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<int> MaxCoefficient(
    "polly-opt-max-coefficient",
    cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden,
    cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<std::string> FusionStrategy(
    "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"),
    cl::Hidden, cl::init("min"), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<std::string>
    MaximizeBandDepth("polly-opt-maximize-bands",
                      cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
                      cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<std::string> OuterCoincidence(
    "polly-opt-outer-coincidence",
    cl::desc("Try to construct schedules where the outer member of each band "
             "satisfies the coincidence constraints (yes/no)"),
    cl::Hidden, cl::init("no"), cl::ZeroOrMore, cl::cat(PollyCategory));

// Options controlling prevectorization and first-level tiling.

static cl::opt<int> PrevectorWidth(
    "polly-prevect-width",
    cl::desc(
        "The number of loop iterations to strip-mine for pre-vectorization"),
    cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<bool> FirstLevelTiling("polly-tiling",
                                      cl::desc("Enable loop tiling"),
                                      cl::init(true), cl::ZeroOrMore,
                                      cl::cat(PollyCategory));

// Target parameters used by the matrix-multiplication pattern optimization.
// NOTE(review): the flag name "polly-target-througput-vector-fma" (and the
// variable ThrougputVectorFma) misspell "throughput"; renaming would change
// the user-visible option, so it is only flagged here.

static cl::opt<int> LatencyVectorFma(
    "polly-target-latency-vector-fma",
    cl::desc("The minimal number of cycles between issuing two "
             "dependent consecutive vector fused multiply-add "
             "instructions."),
    cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<int> ThrougputVectorFma(
    "polly-target-througput-vector-fma",
    cl::desc("A throughput of the processor floating-point arithmetic units "
             "expressed in the number of vector fused multiply-add "
             "instructions per clock cycle."),
    cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::list<int>
    CacheLevelAssociativity("polly-target-cache-level-associativity",
                            cl::desc("The associativity of each cache level."),
                            cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
                            cl::cat(PollyCategory));

static cl::list<int> CacheLevelSizes(
    "polly-target-cache-level-sizes",
    cl::desc("The size of each cache level specified in bytes."), cl::Hidden,
    cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory));

// First-level tile sizes (tiling itself is toggled by FirstLevelTiling).

static cl::opt<int> FirstLevelDefaultTileSize(
    "polly-default-tile-size",
    cl::desc("The default tile size (if not enough were provided by"
             " --polly-tile-sizes)"),
    cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::list<int> FirstLevelTileSizes(
    "polly-tile-sizes", cl::desc("A tile size for each loop dimension, filled "
                                 "with --polly-default-tile-size"),
    cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory));

// Optional second level of tiling.

static cl::opt<bool>
    SecondLevelTiling("polly-2nd-level-tiling",
                      cl::desc("Enable a 2nd level loop of loop tiling"),
                      cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<int> SecondLevelDefaultTileSize(
    "polly-2nd-level-default-tile-size",
    cl::desc("The default 2nd-level tile size (if not enough were provided by"
             " --polly-2nd-level-tile-sizes)"),
    cl::Hidden, cl::init(16), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::list<int>
    SecondLevelTileSizes("polly-2nd-level-tile-sizes",
                         cl::desc("A tile size for each loop dimension, filled "
                                  "with --polly-default-tile-size"),
                         cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
                         cl::cat(PollyCategory));

// Optional register tiling; the resulting point loops are fully unrolled
// (see applyRegisterTiling).

static cl::opt<bool> RegisterTiling("polly-register-tiling",
                                    cl::desc("Enable register tiling"),
                                    cl::init(false), cl::ZeroOrMore,
                                    cl::cat(PollyCategory));

static cl::opt<int> RegisterDefaultTileSize(
    "polly-register-tiling-default-tile-size",
    cl::desc("The default register tile size (if not enough were provided by"
             " --polly-register-tile-sizes)"),
    cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::list<int>
    RegisterTileSizes("polly-register-tile-sizes",
                      cl::desc("A tile size for each loop dimension, filled "
                               "with --polly-register-tile-size"),
                      cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
                      cl::cat(PollyCategory));

// Pattern-matching-based optimizations (currently: matrix multiplication).

static cl::opt<bool>
    PMBasedOpts("polly-pattern-matching-based-opts",
                cl::desc("Perform optimizations based on pattern matching"),
                cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
199 
/// @brief Create an isl_union_set, which describes the isolate option based
///        on IsolateDomain.
///
/// @param IsolateDomain An isl_set whose last dimension is the only one that
///                      should belong to the current band node.
static __isl_give isl_union_set *
getIsolateOptions(__isl_take isl_set *IsolateDomain) {
  auto Dims = isl_set_dim(IsolateDomain, isl_dim_set);
  // Turn the set into a relation mapping the outer (prefix) dimensions to the
  // last dimension, which is the one belonging to the current band node.
  auto *IsolateRelation = isl_map_from_domain(IsolateDomain);
  IsolateRelation = isl_map_move_dims(IsolateRelation, isl_dim_out, 0,
                                      isl_dim_in, Dims - 1, 1);
  // Wrap the relation into a set and tag it with the "isolate" id so the isl
  // AST generator recognizes it as an isolate AST build option.
  auto *IsolateOption = isl_map_wrap(IsolateRelation);
  auto *Id = isl_id_alloc(isl_set_get_ctx(IsolateOption), "isolate", nullptr);
  return isl_union_set_from_set(isl_set_set_tuple_id(IsolateOption, Id));
}
215 
/// @brief Create an isl_union_set, which describes the atomic option for the
///        dimension of the current node.
///
/// It may help to reduce the size of generated code.
///
/// @param Ctx An isl_ctx, which is used to create the isl_union_set.
static __isl_give isl_union_set *getAtomicOptions(__isl_take isl_ctx *Ctx) {
  // A universe set over a single unnamed dimension, tagged with the "atomic"
  // id, applies the atomic AST build option to the band member.
  auto *Space = isl_space_set_alloc(Ctx, 0, 1);
  auto *AtomicOption = isl_set_universe(Space);
  auto *Id = isl_id_alloc(Ctx, "atomic", nullptr);
  return isl_union_set_from_set(isl_set_set_tuple_id(AtomicOption, Id));
}
228 
/// @brief Make the last dimension of Set to take values
///        from 0 to VectorWidth - 1.
///
/// @param Set         A set, which should be modified.
/// @param VectorWidth A parameter, which determines the constraint.
static __isl_give isl_set *addExtentConstraints(__isl_take isl_set *Set,
                                                int VectorWidth) {
  auto Dims = isl_set_dim(Set, isl_dim_set);
  auto Space = isl_set_get_space(Set);
  auto *LocalSpace = isl_local_space_from_space(Space);
  // Lower bound: last_dim >= 0 (inequality "0 + 1 * last_dim >= 0").
  auto *ExtConstr =
      isl_constraint_alloc_inequality(isl_local_space_copy(LocalSpace));
  ExtConstr = isl_constraint_set_constant_si(ExtConstr, 0);
  ExtConstr =
      isl_constraint_set_coefficient_si(ExtConstr, isl_dim_set, Dims - 1, 1);
  Set = isl_set_add_constraint(Set, ExtConstr);
  // Upper bound: last_dim <= VectorWidth - 1 (inequality
  // "(VectorWidth - 1) - last_dim >= 0").
  ExtConstr = isl_constraint_alloc_inequality(LocalSpace);
  ExtConstr = isl_constraint_set_constant_si(ExtConstr, VectorWidth - 1);
  ExtConstr =
      isl_constraint_set_coefficient_si(ExtConstr, isl_dim_set, Dims - 1, -1);
  return isl_set_add_constraint(Set, ExtConstr);
}
251 
/// @brief Build the desired set of partial tile prefixes.
///
/// We build a set of partial tile prefixes, which are prefixes of the vector
/// loop that have exactly VectorWidth iterations.
///
/// 1. Get all prefixes of the vector loop.
/// 2. Extend it to a set, which has exactly VectorWidth iterations for
///    any prefix from the set that was built on the previous step.
/// 3. Subtract loop domain from it, project out the vector loop dimension and
///    get a set of prefixes, which don't have exactly VectorWidth iterations.
/// 4. Subtract it from all prefixes of the vector loop and get the desired
///    set.
///
/// @param ScheduleRange A range of a map, which describes a prefix schedule
///                      relation.
static __isl_give isl_set *
getPartialTilePrefixes(__isl_take isl_set *ScheduleRange, int VectorWidth) {
  auto Dims = isl_set_dim(ScheduleRange, isl_dim_set);
  // Step 1: drop the vector-loop dimension to obtain all prefixes.
  auto *LoopPrefixes = isl_set_project_out(isl_set_copy(ScheduleRange),
                                           isl_dim_set, Dims - 1, 1);
  // Step 2: re-append a dimension constrained to [0, VectorWidth - 1].
  auto *ExtentPrefixes =
      isl_set_add_dims(isl_set_copy(LoopPrefixes), isl_dim_set, 1);
  ExtentPrefixes = addExtentConstraints(ExtentPrefixes, VectorWidth);
  // Step 3: prefixes for which some of these VectorWidth iterations are not
  // in the actual loop domain, i.e. the partial-tile prefixes.
  auto *BadPrefixes = isl_set_subtract(ExtentPrefixes, ScheduleRange);
  BadPrefixes = isl_set_project_out(BadPrefixes, isl_dim_set, Dims - 1, 1);
  // Step 4: keep only the prefixes with exactly VectorWidth iterations.
  return isl_set_subtract(LoopPrefixes, BadPrefixes);
}
279 
/// @brief Attach AST build options to @p Node that isolate its full tiles
///        (tiles of exactly @p VectorWidth iterations) from the partial ones
///        and generate the remaining iterations atomically.
__isl_give isl_schedule_node *ScheduleTreeOptimizer::isolateFullPartialTiles(
    __isl_take isl_schedule_node *Node, int VectorWidth) {
  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
  // Descend two levels so the prefix schedule computed below includes the
  // dimensions of this band.
  Node = isl_schedule_node_child(Node, 0);
  Node = isl_schedule_node_child(Node, 0);
  auto *SchedRelUMap = isl_schedule_node_get_prefix_schedule_relation(Node);
  auto *ScheduleRelation = isl_map_from_union_map(SchedRelUMap);
  auto *ScheduleRange = isl_map_range(ScheduleRelation);
  auto *IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth);
  auto *AtomicOption = getAtomicOptions(isl_set_get_ctx(IsolateDomain));
  auto *IsolateOption = getIsolateOptions(IsolateDomain);
  // Climb back to the original band and attach the combined options.
  Node = isl_schedule_node_parent(Node);
  Node = isl_schedule_node_parent(Node);
  auto *Options = isl_union_set_union(IsolateOption, AtomicOption);
  Node = isl_schedule_node_band_set_ast_build_options(Node, Options);
  return Node;
}
297 
/// @brief Strip-mine band member @p DimToVectorize by @p VectorWidth to
///        prepare it for inner-loop vectorization.
///
/// @param Node           A band node containing the member to vectorize.
/// @param DimToVectorize Index of the band member to strip-mine.
/// @param VectorWidth    The strip-mine width.
/// @return The modified schedule tree, positioned at the inserted SIMD mark.
__isl_give isl_schedule_node *
ScheduleTreeOptimizer::prevectSchedBand(__isl_take isl_schedule_node *Node,
                                        unsigned DimToVectorize,
                                        int VectorWidth) {
  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);

  auto Space = isl_schedule_node_band_get_space(Node);
  auto ScheduleDimensions = isl_space_dim(Space, isl_dim_set);
  isl_space_free(Space);
  assert(DimToVectorize < ScheduleDimensions);

  // Split the band (before and, if needed, after the chosen member) so that
  // the dimension to vectorize becomes a single-member band of its own.
  if (DimToVectorize > 0) {
    Node = isl_schedule_node_band_split(Node, DimToVectorize);
    Node = isl_schedule_node_child(Node, 0);
  }
  if (DimToVectorize < ScheduleDimensions - 1)
    Node = isl_schedule_node_band_split(Node, 1);
  // Strip-mine that band by VectorWidth and isolate the full tiles so they
  // can be emitted without boundary conditions.
  Space = isl_schedule_node_band_get_space(Node);
  auto Sizes = isl_multi_val_zero(Space);
  auto Ctx = isl_schedule_node_get_ctx(Node);
  Sizes =
      isl_multi_val_set_val(Sizes, 0, isl_val_int_from_si(Ctx, VectorWidth));
  Node = isl_schedule_node_band_tile(Node, Sizes);
  Node = isolateFullPartialTiles(Node, VectorWidth);
  Node = isl_schedule_node_child(Node, 0);
  // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise,
  // we will have troubles to match it in the backend. The set
  // "{ unroll[x]: 1 = 0 }" is empty, which disables unrolling for this band.
  Node = isl_schedule_node_band_set_ast_build_options(
      Node, isl_union_set_read_from_str(Ctx, "{ unroll[x]: 1 = 0 }"));
  // Sink the point band to the innermost position and mark the result as a
  // SIMD loop for later stages.
  Node = isl_schedule_node_band_sink(Node);
  Node = isl_schedule_node_child(Node, 0);
  if (isl_schedule_node_get_type(Node) == isl_schedule_node_leaf)
    Node = isl_schedule_node_parent(Node);
  isl_id *LoopMarker = isl_id_alloc(Ctx, "SIMD", nullptr);
  Node = isl_schedule_node_insert_mark(Node, LoopMarker);
  return Node;
}
335 
/// @brief Tile a band node and surround the tile and point loops with mark
///        nodes named "<Identifier> - Tiles" and "<Identifier> - Points".
///
/// @param Node            The band node to tile.
/// @param Identifier      Prefix for the two mark node names.
/// @param TileSizes       Tile size per band dimension; missing entries are
///                        filled with @p DefaultTileSize.
/// @param DefaultTileSize Fallback tile size.
/// @return The point band created by the tiling.
__isl_give isl_schedule_node *
ScheduleTreeOptimizer::tileNode(__isl_take isl_schedule_node *Node,
                                const char *Identifier, ArrayRef<int> TileSizes,
                                int DefaultTileSize) {
  auto Ctx = isl_schedule_node_get_ctx(Node);
  auto Space = isl_schedule_node_band_get_space(Node);
  auto Dims = isl_space_dim(Space, isl_dim_set);
  auto Sizes = isl_multi_val_zero(Space);
  std::string IdentifierString(Identifier);
  // Build the tile-size vector, padding with the default size.
  for (unsigned i = 0; i < Dims; i++) {
    auto tileSize = i < TileSizes.size() ? TileSizes[i] : DefaultTileSize;
    Sizes = isl_multi_val_set_val(Sizes, i, isl_val_int_from_si(Ctx, tileSize));
  }
  // Mark node above the (future) tile band.
  auto TileLoopMarkerStr = IdentifierString + " - Tiles";
  isl_id *TileLoopMarker =
      isl_id_alloc(Ctx, TileLoopMarkerStr.c_str(), nullptr);
  Node = isl_schedule_node_insert_mark(Node, TileLoopMarker);
  Node = isl_schedule_node_child(Node, 0);
  Node = isl_schedule_node_band_tile(Node, Sizes);
  Node = isl_schedule_node_child(Node, 0);
  // Mark node above the point band; return positioned at the point band.
  auto PointLoopMarkerStr = IdentifierString + " - Points";
  isl_id *PointLoopMarker =
      isl_id_alloc(Ctx, PointLoopMarkerStr.c_str(), nullptr);
  Node = isl_schedule_node_insert_mark(Node, PointLoopMarker);
  Node = isl_schedule_node_child(Node, 0);
  return Node;
}
363 
/// @brief Apply a register tiling to @p Node and request full unrolling of
///        the resulting point loops via the "{unroll[x]}" AST build option.
///
/// @param Node            The band node to tile.
/// @param TileSizes       Tile size per band dimension; missing entries are
///                        filled with @p DefaultTileSize.
/// @param DefaultTileSize Fallback tile size.
__isl_give isl_schedule_node *
ScheduleTreeOptimizer::applyRegisterTiling(__isl_take isl_schedule_node *Node,
                                           llvm::ArrayRef<int> TileSizes,
                                           int DefaultTileSize) {
  // Fetch the context before tileNode consumes Node.
  auto *Ctx = isl_schedule_node_get_ctx(Node);
  Node = tileNode(Node, "Register tiling", TileSizes, DefaultTileSize);
  Node = isl_schedule_node_band_set_ast_build_options(
      Node, isl_union_set_read_from_str(Ctx, "{unroll[x]}"));
  return Node;
}
374 
375 bool ScheduleTreeOptimizer::isTileableBandNode(
376     __isl_keep isl_schedule_node *Node) {
377   if (isl_schedule_node_get_type(Node) != isl_schedule_node_band)
378     return false;
379 
380   if (isl_schedule_node_n_children(Node) != 1)
381     return false;
382 
383   if (!isl_schedule_node_band_get_permutable(Node))
384     return false;
385 
386   auto Space = isl_schedule_node_band_get_space(Node);
387   auto Dims = isl_space_dim(Space, isl_dim_set);
388   isl_space_free(Space);
389 
390   if (Dims <= 1)
391     return false;
392 
393   auto Child = isl_schedule_node_get_child(Node, 0);
394   auto Type = isl_schedule_node_get_type(Child);
395   isl_schedule_node_free(Child);
396 
397   if (Type != isl_schedule_node_leaf)
398     return false;
399 
400   return true;
401 }
402 
/// @brief Apply the standard band optimizations: optional first-level,
///        second-level and register tiling, followed by prevectorization.
///
/// @param Node The band node to optimize.
/// @param User Unused callback payload.
__isl_give isl_schedule_node *
ScheduleTreeOptimizer::standardBandOpts(__isl_take isl_schedule_node *Node,
                                        void *User) {
  // Apply the requested tiling levels from the outside in.
  if (FirstLevelTiling)
    Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes,
                    FirstLevelDefaultTileSize);

  if (SecondLevelTiling)
    Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes,
                    SecondLevelDefaultTileSize);

  if (RegisterTiling)
    Node =
        applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize);

  // Without a vectorizer there is no point in preparing a vector loop.
  if (PollyVectorizerChoice == VECTORIZER_NONE)
    return Node;

  auto Space = isl_schedule_node_band_get_space(Node);
  auto Dims = isl_space_dim(Space, isl_dim_set);
  isl_space_free(Space);

  // Strip-mine the innermost band member that satisfies the coincidence
  // constraints to enable inner-loop vectorization.
  for (int i = Dims - 1; i >= 0; i--)
    if (isl_schedule_node_band_member_get_coincident(Node, i)) {
      Node = prevectSchedBand(Node, i, PrevectorWidth);
      break;
    }

  return Node;
}
433 
/// @brief Check whether output dimensions of the map rely on the specified
///        input dimension.
///
/// The check projects out input dimension @p DimNum, re-inserts it as an
/// unconstrained dimension and compares the result with the original map.
/// If both maps are equal, the outputs do not depend on @p DimNum.
///
/// @param IslMap The isl map to be considered.
/// @param DimNum The number of an input dimension to be checked.
static bool isInputDimUsed(__isl_take isl_map *IslMap, unsigned DimNum) {
  auto *CheckedAccessRelation =
      isl_map_project_out(isl_map_copy(IslMap), isl_dim_in, DimNum, 1);
  CheckedAccessRelation =
      isl_map_insert_dims(CheckedAccessRelation, isl_dim_in, DimNum, 1);
  // Restore the tuple ids dropped by the dimension manipulation so the
  // equality check below is not spuriously false.
  auto *InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in);
  CheckedAccessRelation =
      isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_in, InputDimsId);
  InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_out);
  CheckedAccessRelation =
      isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_out, InputDimsId);
  // NOTE(review): isl_map_is_equal may return isl_bool_error (-1); the
  // negation treats that as "dimension is used" - confirm this conservative
  // fallback is intended.
  auto res = !isl_map_is_equal(CheckedAccessRelation, IslMap);
  isl_map_free(CheckedAccessRelation);
  isl_map_free(IslMap);
  return res;
}
455 
/// @brief Check if the SCoP statement could probably be optimized with
///        analytical modeling.
///
/// containsMatrMult tries to determine whether the following conditions
/// are true:
/// 1. all memory accesses of the statement will have stride 0 or 1,
///    if we interchange loops (switch the variable used in the inner
///    loop to the outer loop).
/// 2. all memory accesses of the statement except from the last one, are
///    read memory access and the last one is write memory access.
/// 3. all subscripts of the last memory access of the statement don't contain
///    the variable used in the inner loop.
///
/// @param PartialSchedule The PartialSchedule that contains a SCoP statement
///        to check.
static bool containsMatrMult(__isl_keep isl_map *PartialSchedule) {
  // The ScopStmt is attached as the user pointer of the input tuple id.
  auto InputDimsId = isl_map_get_tuple_id(PartialSchedule, isl_dim_in);
  auto *ScpStmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
  isl_id_free(InputDimsId);
  // A statement with a single access cannot be a multiply-accumulate.
  if (ScpStmt->size() <= 1)
    return false;
  // Condition 1 and 2 for the leading accesses: reads with stride 0 or 1
  // under the (circularly shifted) partial schedule.
  // NOTE(review): the bound size() - 2 together with the extra MemA++ below
  // means the second-to-last access is never validated - confirm this is
  // intentional (it appears to skip one access before the final write).
  auto MemA = ScpStmt->begin();
  for (unsigned i = 0; i < ScpStmt->size() - 2 && MemA != ScpStmt->end();
       i++, MemA++)
    if (!(*MemA)->isRead() ||
        ((*MemA)->isArrayKind() &&
         !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) ||
           (*MemA)->isStrideZero(isl_map_copy(PartialSchedule)))))
      return false;
  MemA++;
  // The last access must be an array write with stride 0 or 1.
  if (!(*MemA)->isWrite() || !(*MemA)->isArrayKind() ||
      !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) ||
        (*MemA)->isStrideZero(isl_map_copy(PartialSchedule))))
    return false;
  // Condition 3: the write must not depend on the innermost input dimension.
  auto DimNum = isl_map_dim(PartialSchedule, isl_dim_in);
  return !isInputDimUsed((*MemA)->getAccessRelation(), DimNum - 1);
}
493 
/// @brief Circular shift of output dimensions of the integer map.
///
/// Rotates the output dimensions so the last one becomes the first, e.g.
/// [o0, o1, o2] -> [o2, o0, o1].
///
/// @param IslMap The isl map to be modified.
static __isl_give isl_map *circularShiftOutputDims(__isl_take isl_map *IslMap) {
  auto DimNum = isl_map_dim(IslMap, isl_dim_out);
  if (DimNum == 0)
    return IslMap;
  // Remember the input tuple id; the dimension moves below discard it.
  auto InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in);
  // Park the last output dimension in front of the inputs, then move it back
  // as the first output dimension.
  IslMap = isl_map_move_dims(IslMap, isl_dim_in, 0, isl_dim_out, DimNum - 1, 1);
  IslMap = isl_map_move_dims(IslMap, isl_dim_out, 0, isl_dim_in, 0, 1);
  return isl_map_set_tuple_id(IslMap, isl_dim_in, InputDimsId);
}
506 
/// @brief Permute two dimensions of the band node.
///
/// Permute FirstDim and SecondDim dimensions of the Node.
///
/// @param Node The band node to be modified.
/// @param FirstDim The first dimension to be permuted.
/// @param SecondDim The second dimension to be permuted.
static __isl_give isl_schedule_node *
permuteBandNodeDimensions(__isl_take isl_schedule_node *Node, unsigned FirstDim,
                          unsigned SecondDim) {
  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band &&
         isl_schedule_node_band_n_member(Node) > std::max(FirstDim, SecondDim));
  // Swap the two members within the band's partial schedule.
  auto PartialSchedule = isl_schedule_node_band_get_partial_schedule(Node);
  auto PartialScheduleFirstDim =
      isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, FirstDim);
  auto PartialScheduleSecondDim =
      isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, SecondDim);
  PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
      PartialSchedule, SecondDim, PartialScheduleFirstDim);
  PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
      PartialSchedule, FirstDim, PartialScheduleSecondDim);
  // Replace the old band node with one carrying the permuted schedule.
  Node = isl_schedule_node_delete(Node);
  Node = isl_schedule_node_insert_partial_schedule(Node, PartialSchedule);
  return Node;
}
532 
/// @brief Create the BLIS micro-kernel: an Mr x Nr register tiling of the
///        first two band members, unrolled via applyRegisterTiling.
__isl_give isl_schedule_node *ScheduleTreeOptimizer::createMicroKernel(
    __isl_take isl_schedule_node *Node, MicroKernelParamsTy MicroKernelParams) {
  return applyRegisterTiling(Node, {MicroKernelParams.Mr, MicroKernelParams.Nr},
                             1);
}
538 
/// @brief Create the BLIS macro-kernel: tile the band with {Mc, Nc, Kc} and
///        interchange the second and third tile dimensions.
__isl_give isl_schedule_node *ScheduleTreeOptimizer::createMacroKernel(
    __isl_take isl_schedule_node *Node, MacroKernelParamsTy MacroKernelParams) {
  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
  // All-ones parameters mean no cache-level tiling could be derived; leave
  // the band untouched.
  if (MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 1 &&
      MacroKernelParams.Kc == 1)
    return Node;
  Node = tileNode(
      Node, "1st level tiling",
      {MacroKernelParams.Mc, MacroKernelParams.Nc, MacroKernelParams.Kc}, 1);
  // tileNode returns the point band; climb two levels to the tile band,
  // interchange its second and third members, then descend back.
  Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
  Node = permuteBandNodeDimensions(Node, 1, 2);
  return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
}
552 
/// Get parameters of the BLIS micro kernel.
///
/// We choose the Mr and Nr parameters of the micro kernel to be large enough
/// such that no stalls caused by the combination of latencies and dependencies
/// are introduced during the updates of the resulting matrix of the matrix
/// multiplication. However, they should also be as small as possible to
/// release more registers for entries of multiplied matrices.
///
/// @param TTI Target Transform Info.
/// @return The structure of type MicroKernelParamsTy.
/// @see MicroKernelParamsTy
static struct MicroKernelParamsTy
getMicroKernelParams(const llvm::TargetTransformInfo *TTI) {
  assert(TTI && "The target transform info should be provided.");

  // Nvec - Number of double-precision floating-point numbers that can be hold
  // by a vector register. Use 2 by default.
  auto Nvec = TTI->getRegisterBitWidth(true) / 64;
  if (Nvec == 0)
    Nvec = 2;
  // Nr: smallest multiple of Nvec whose square covers the number of
  // independent updates needed to hide the FMA latency
  // (Nvec * latency * throughput); Mr then satisfies
  // Mr * Nr >= Nvec * latency * throughput.
  int Nr =
      ceil(sqrt(Nvec * LatencyVectorFma * ThrougputVectorFma) / Nvec) * Nvec;
  int Mr = ceil(Nvec * LatencyVectorFma * ThrougputVectorFma / Nr);
  return {Mr, Nr};
}
578 
/// Get parameters of the BLIS macro kernel.
///
/// During the computation of matrix multiplication, blocks of partitioned
/// matrices are mapped to different layers of the memory hierarchy.
/// To optimize data reuse, blocks should be ideally kept in cache between
/// iterations. Since parameters of the macro kernel determine sizes of these
/// blocks, there are upper and lower bounds on these parameters.
///
/// @param MicroKernelParams Parameters of the micro-kernel
///                          to be taken into account.
/// @return The structure of type MacroKernelParamsTy.
/// @see MacroKernelParamsTy
/// @see MicroKernelParamsTy
static struct MacroKernelParamsTy
getMacroKernelParams(const MicroKernelParamsTy &MicroKernelParams) {
  // According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf,
  // it requires information about the first two levels of a cache to determine
  // all the parameters of a macro-kernel. It also checks that an associativity
  // degree of a cache level is greater than two. Otherwise, another algorithm
  // for determination of the parameters should be used.
  if (!(MicroKernelParams.Mr > 0 && MicroKernelParams.Nr > 0 &&
        CacheLevelSizes.size() >= 2 && CacheLevelAssociativity.size() >= 2 &&
        CacheLevelSizes[0] > 0 && CacheLevelSizes[1] > 0 &&
        CacheLevelAssociativity[0] > 2 && CacheLevelAssociativity[1] > 2))
    return {1, 1, 1};
  // The formulas below follow the analytical model of the paper cited above.
  // The constant 8 is presumably sizeof(double) in bytes, matching the
  // double-precision assumption in getMicroKernelParams - TODO confirm.
  int Cbr = floor(
      (CacheLevelAssociativity[0] - 1) /
      (1 + static_cast<double>(MicroKernelParams.Mr) / MicroKernelParams.Nr));
  int Kc = (Cbr * CacheLevelSizes[0]) /
           (MicroKernelParams.Nr * CacheLevelAssociativity[0] * 8);
  double Cac = static_cast<double>(MicroKernelParams.Mr * Kc * 8 *
                                   CacheLevelAssociativity[1]) /
               CacheLevelSizes[1];
  double Cbc = static_cast<double>(MicroKernelParams.Nr * Kc * 8 *
                                   CacheLevelAssociativity[1]) /
               CacheLevelSizes[1];
  int Mc = floor(MicroKernelParams.Mr / Cac);
  int Nc =
      floor((MicroKernelParams.Nr * (CacheLevelAssociativity[1] - 2)) / Cbc);
  return {Mc, Nc, Kc};
}
620 
621 __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern(
622     __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
623   assert(TTI && "The target transform info should be provided.");
624   auto MicroKernelParams = getMicroKernelParams(TTI);
625   auto MacroKernelParams = getMacroKernelParams(MicroKernelParams);
626   Node = createMacroKernel(Node, MacroKernelParams);
627   Node = createMicroKernel(Node, MicroKernelParams);
628   return Node;
629 }
630 
631 bool ScheduleTreeOptimizer::isMatrMultPattern(
632     __isl_keep isl_schedule_node *Node) {
633   auto *PartialSchedule =
634       isl_schedule_node_band_get_partial_schedule_union_map(Node);
635   if (isl_schedule_node_band_n_member(Node) != 3 ||
636       isl_union_map_n_map(PartialSchedule) != 1) {
637     isl_union_map_free(PartialSchedule);
638     return false;
639   }
640   auto *NewPartialSchedule = isl_map_from_union_map(PartialSchedule);
641   NewPartialSchedule = circularShiftOutputDims(NewPartialSchedule);
642   if (containsMatrMult(NewPartialSchedule)) {
643     isl_map_free(NewPartialSchedule);
644     return true;
645   }
646   isl_map_free(NewPartialSchedule);
647   return false;
648 }
649 
650 __isl_give isl_schedule_node *
651 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node,
652                                     void *User) {
653   if (!isTileableBandNode(Node))
654     return Node;
655 
656   if (PMBasedOpts && User && isMatrMultPattern(Node)) {
657     DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
658     const llvm::TargetTransformInfo *TTI;
659     TTI = static_cast<const llvm::TargetTransformInfo *>(User);
660     Node = optimizeMatMulPattern(Node, TTI);
661   }
662 
663   return standardBandOpts(Node, User);
664 }
665 
666 __isl_give isl_schedule *
667 ScheduleTreeOptimizer::optimizeSchedule(__isl_take isl_schedule *Schedule,
668                                         const llvm::TargetTransformInfo *TTI) {
669   isl_schedule_node *Root = isl_schedule_get_root(Schedule);
670   Root = optimizeScheduleNode(Root, TTI);
671   isl_schedule_free(Schedule);
672   auto S = isl_schedule_node_get_schedule(Root);
673   isl_schedule_node_free(Root);
674   return S;
675 }
676 
677 __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeScheduleNode(
678     __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
679   Node = isl_schedule_node_map_descendant_bottom_up(
680       Node, optimizeBand, const_cast<void *>(static_cast<const void *>(TTI)));
681   return Node;
682 }
683 
684 bool ScheduleTreeOptimizer::isProfitableSchedule(
685     Scop &S, __isl_keep isl_union_map *NewSchedule) {
686   // To understand if the schedule has been optimized we check if the schedule
687   // has changed at all.
688   // TODO: We can improve this by tracking if any necessarily beneficial
689   // transformations have been performed. This can e.g. be tiling, loop
690   // interchange, or ...) We can track this either at the place where the
691   // transformation has been performed or, in case of automatic ILP based
692   // optimizations, by comparing (yet to be defined) performance metrics
693   // before/after the scheduling optimizer
694   // (e.g., #stride-one accesses)
695   isl_union_map *OldSchedule = S.getSchedule();
696   bool changed = !isl_union_map_is_equal(OldSchedule, NewSchedule);
697   isl_union_map_free(OldSchedule);
698   return changed;
699 }
700 
701 namespace {
702 class IslScheduleOptimizer : public ScopPass {
703 public:
704   static char ID;
705   explicit IslScheduleOptimizer() : ScopPass(ID) { LastSchedule = nullptr; }
706 
707   ~IslScheduleOptimizer() { isl_schedule_free(LastSchedule); }
708 
709   /// @brief Optimize the schedule of the SCoP @p S.
710   bool runOnScop(Scop &S) override;
711 
712   /// @brief Print the new schedule for the SCoP @p S.
713   void printScop(raw_ostream &OS, Scop &S) const override;
714 
715   /// @brief Register all analyses and transformation required.
716   void getAnalysisUsage(AnalysisUsage &AU) const override;
717 
718   /// @brief Release the internal memory.
719   void releaseMemory() override {
720     isl_schedule_free(LastSchedule);
721     LastSchedule = nullptr;
722   }
723 
724 private:
725   isl_schedule *LastSchedule;
726 };
727 } // namespace
728 
729 char IslScheduleOptimizer::ID = 0;
730 
// Compute an optimized schedule for the SCoP @p S and, if profitable,
// install it as the SCoP's new schedule tree. Always returns false: the
// LLVM IR itself is not modified here; only the SCoP description changes.
bool IslScheduleOptimizer::runOnScop(Scop &S) {

  // Skip empty SCoPs but still allow code generation as it will delete the
  // loops present but not needed.
  if (S.getSize() == 0) {
    S.markAsOptimized();
    return false;
  }

  const Dependences &D =
      getAnalysis<DependenceInfo>().getDependences(Dependences::AL_Statement);

  if (!D.hasValidDependences())
    return false;

  // Drop any schedule kept from a previous SCoP.
  // NOTE(review): LastSchedule is never reassigned later in this function,
  // so printScop will always report "n/a" — confirm this is intended.
  isl_schedule_free(LastSchedule);
  LastSchedule = nullptr;

  // Build input data.
  // Validity constraints must always cover all dependence kinds; proximity
  // (what the scheduler tries to minimize) is configurable via
  // -polly-opt-optimize-deps.
  int ValidityKinds =
      Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
  int ProximityKinds;

  if (OptimizeDeps == "all")
    ProximityKinds =
        Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
  else if (OptimizeDeps == "raw")
    ProximityKinds = Dependences::TYPE_RAW;
  else {
    errs() << "Do not know how to optimize for '" << OptimizeDeps << "'"
           << " Falling back to optimizing all dependences.\n";
    ProximityKinds =
        Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
  }

  isl_union_set *Domain = S.getDomains();

  if (!Domain)
    return false;

  isl_union_map *Validity = D.getDependences(ValidityKinds);
  isl_union_map *Proximity = D.getDependences(ProximityKinds);

  // Simplify the dependences by removing the constraints introduced by the
  // domains. This can speed up the scheduling time significantly, as large
  // constant coefficients will be removed from the dependences. The
  // introduction of some additional dependences reduces the possible
  // transformations, but in most cases, such transformation do not seem to be
  // interesting anyway. In some cases this option may stop the scheduler to
  // find any schedule.
  if (SimplifyDeps == "yes") {
    Validity = isl_union_map_gist_domain(Validity, isl_union_set_copy(Domain));
    Validity = isl_union_map_gist_range(Validity, isl_union_set_copy(Domain));
    Proximity =
        isl_union_map_gist_domain(Proximity, isl_union_set_copy(Domain));
    Proximity = isl_union_map_gist_range(Proximity, isl_union_set_copy(Domain));
  } else if (SimplifyDeps != "no") {
    errs() << "warning: Option -polly-opt-simplify-deps should either be 'yes' "
              "or 'no'. Falling back to default: 'yes'\n";
  }

  DEBUG(dbgs() << "\n\nCompute schedule from: ");
  DEBUG(dbgs() << "Domain := " << stringFromIslObj(Domain) << ";\n");
  DEBUG(dbgs() << "Proximity := " << stringFromIslObj(Proximity) << ";\n");
  DEBUG(dbgs() << "Validity := " << stringFromIslObj(Validity) << ";\n");

  // Translate the textual fusion strategy into isl's serialize-sccs option:
  // serializing all SCCs corresponds to minimal fusion.
  unsigned IslSerializeSCCs;

  if (FusionStrategy == "max") {
    IslSerializeSCCs = 0;
  } else if (FusionStrategy == "min") {
    IslSerializeSCCs = 1;
  } else {
    errs() << "warning: Unknown fusion strategy. Falling back to maximal "
              "fusion.\n";
    IslSerializeSCCs = 0;
  }

  int IslMaximizeBands;

  if (MaximizeBandDepth == "yes") {
    IslMaximizeBands = 1;
  } else if (MaximizeBandDepth == "no") {
    IslMaximizeBands = 0;
  } else {
    errs() << "warning: Option -polly-opt-maximize-bands should either be 'yes'"
              " or 'no'. Falling back to default: 'yes'\n";
    IslMaximizeBands = 1;
  }

  int IslOuterCoincidence;

  if (OuterCoincidence == "yes") {
    IslOuterCoincidence = 1;
  } else if (OuterCoincidence == "no") {
    IslOuterCoincidence = 0;
  } else {
    errs() << "warning: Option -polly-opt-outer-coincidence should either be "
              "'yes' or 'no'. Falling back to default: 'no'\n";
    IslOuterCoincidence = 0;
  }

  isl_ctx *Ctx = S.getIslCtx();

  // Configure the isl scheduler on the SCoP's context. These settings are
  // global to the context and stay in effect after this pass.
  isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence);
  isl_options_set_schedule_serialize_sccs(Ctx, IslSerializeSCCs);
  isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands);
  isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm);
  isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient);
  isl_options_set_tile_scale_tile_loops(Ctx, 0);

  // Let isl continue (and return a null schedule) instead of aborting when
  // the scheduler fails; the previous error behavior is restored below.
  auto OnErrorStatus = isl_options_get_on_error(Ctx);
  isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE);

  // Build the schedule constraints. Note that Validity is consumed twice:
  // a copy as the validity constraints and the original as the coincidence
  // constraints.
  isl_schedule_constraints *ScheduleConstraints;
  ScheduleConstraints = isl_schedule_constraints_on_domain(Domain);
  ScheduleConstraints =
      isl_schedule_constraints_set_proximity(ScheduleConstraints, Proximity);
  ScheduleConstraints = isl_schedule_constraints_set_validity(
      ScheduleConstraints, isl_union_map_copy(Validity));
  ScheduleConstraints =
      isl_schedule_constraints_set_coincidence(ScheduleConstraints, Validity);
  isl_schedule *Schedule;
  Schedule = isl_schedule_constraints_compute_schedule(ScheduleConstraints);
  isl_options_set_on_error(Ctx, OnErrorStatus);

  // In cases the scheduler is not able to optimize the code, we just do not
  // touch the schedule.
  if (!Schedule)
    return false;

  DEBUG({
    auto *P = isl_printer_to_str(Ctx);
    P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
    P = isl_printer_print_schedule(P, Schedule);
    dbgs() << "NewScheduleTree: \n" << isl_printer_get_str(P) << "\n";
    isl_printer_free(P);
  });

  // Apply the post-scheduling transformations (tiling, prevectorization,
  // pattern-based optimizations) on the isl-computed schedule.
  Function &F = S.getFunction();
  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  isl_schedule *NewSchedule =
      ScheduleTreeOptimizer::optimizeSchedule(Schedule, TTI);
  isl_union_map *NewScheduleMap = isl_schedule_get_map(NewSchedule);

  // Only install the new schedule if it actually differs from the old one.
  if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewScheduleMap)) {
    isl_union_map_free(NewScheduleMap);
    isl_schedule_free(NewSchedule);
    return false;
  }

  S.setScheduleTree(NewSchedule);
  S.markAsOptimized();

  isl_union_map_free(NewScheduleMap);
  return false;
}
888 
889 void IslScheduleOptimizer::printScop(raw_ostream &OS, Scop &) const {
890   isl_printer *p;
891   char *ScheduleStr;
892 
893   OS << "Calculated schedule:\n";
894 
895   if (!LastSchedule) {
896     OS << "n/a\n";
897     return;
898   }
899 
900   p = isl_printer_to_str(isl_schedule_get_ctx(LastSchedule));
901   p = isl_printer_print_schedule(p, LastSchedule);
902   ScheduleStr = isl_printer_get_str(p);
903   isl_printer_free(p);
904 
905   OS << ScheduleStr << "\n";
906 }
907 
// Declare the analyses this pass depends on.
void IslScheduleOptimizer::getAnalysisUsage(AnalysisUsage &AU) const {
  // Inherit the requirements common to every ScopPass.
  ScopPass::getAnalysisUsage(AU);
  // Data dependences drive the scheduling algorithm (see runOnScop).
  AU.addRequired<DependenceInfo>();
  // Target information is forwarded to the schedule-tree optimizer for the
  // pattern-based optimizations.
  AU.addRequired<TargetTransformInfoWrapperPass>();
}
913 
// Factory function creating a new instance of this pass; ownership is
// transferred to the caller (typically the pass manager).
Pass *polly::createIslScheduleOptimizerPass() {
  return new IslScheduleOptimizer();
}
917 
// Register the pass and its analysis dependencies with the LLVM pass
// registry under the command-line name "polly-opt-isl".
INITIALIZE_PASS_BEGIN(IslScheduleOptimizer, "polly-opt-isl",
                      "Polly - Optimize schedule of SCoP", false, false);
INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass);
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass);
INITIALIZE_PASS_END(IslScheduleOptimizer, "polly-opt-isl",
                    "Polly - Optimize schedule of SCoP", false, false)
925