1 //===- Schedule.cpp - Calculate an optimized schedule ---------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass generates an entirely new schedule tree from the data dependences
11 // and iteration domains. The new schedule tree is computed in two steps:
12 //
13 // 1) The isl scheduling optimizer is run
14 //
15 // The isl scheduling optimizer creates a new schedule tree that maximizes
16 // parallelism and tileability and minimizes data-dependence distances. The
17 // algorithm used is a modified version of the ``Pluto'' algorithm:
18 //
19 //   U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
20 //   A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
21 //   In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
22 //   Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008.
23 //
24 // 2) A set of post-scheduling transformations is applied on the schedule tree.
25 //
26 // These optimizations include:
27 //
28 //  - Tiling of the innermost tilable bands
29 //  - Prevectorization - The coice of a possible outer loop that is strip-mined
30 //                       to the innermost level to enable inner-loop
31 //                       vectorization.
32 //  - Some optimizations for spatial locality are also planned.
33 //
34 // For a detailed description of the schedule tree itself please see section 6
35 // of:
36 //
37 // Polyhedral AST generation is more than scanning polyhedra
38 // Tobias Grosser, Sven Verdoolaege, Albert Cohen
39 // ACM Transations on Programming Languages and Systems (TOPLAS),
40 // 37(4), July 2015
41 // http://www.grosser.es/#pub-polyhedral-AST-generation
42 //
43 // This publication also contains a detailed discussion of the different options
44 // for polyhedral loop unrolling, full/partial tile separation and other uses
45 // of the schedule tree.
46 //
47 //===----------------------------------------------------------------------===//
48 
49 #include "polly/ScheduleOptimizer.h"
50 #include "polly/CodeGen/CodeGeneration.h"
51 #include "polly/DependenceInfo.h"
52 #include "polly/LinkAllPasses.h"
53 #include "polly/Options.h"
54 #include "polly/ScopInfo.h"
55 #include "polly/Support/GICHelper.h"
56 #include "llvm/Analysis/TargetTransformInfo.h"
57 #include "llvm/Support/Debug.h"
58 #include "isl/aff.h"
59 #include "isl/band.h"
60 #include "isl/constraint.h"
61 #include "isl/map.h"
62 #include "isl/options.h"
63 #include "isl/printer.h"
64 #include "isl/schedule.h"
65 #include "isl/schedule_node.h"
66 #include "isl/space.h"
67 #include "isl/union_map.h"
68 #include "isl/union_set.h"
69 
70 using namespace llvm;
71 using namespace polly;
72 
73 #define DEBUG_TYPE "polly-opt-isl"
74 
75 static cl::opt<std::string>
76     OptimizeDeps("polly-opt-optimize-only",
77                  cl::desc("Only a certain kind of dependences (all/raw)"),
78                  cl::Hidden, cl::init("all"), cl::ZeroOrMore,
79                  cl::cat(PollyCategory));
80 
81 static cl::opt<std::string>
82     SimplifyDeps("polly-opt-simplify-deps",
83                  cl::desc("Dependences should be simplified (yes/no)"),
84                  cl::Hidden, cl::init("yes"), cl::ZeroOrMore,
85                  cl::cat(PollyCategory));
86 
87 static cl::opt<int> MaxConstantTerm(
88     "polly-opt-max-constant-term",
89     cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden,
90     cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));
91 
92 static cl::opt<int> MaxCoefficient(
93     "polly-opt-max-coefficient",
94     cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden,
95     cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));
96 
97 static cl::opt<std::string> FusionStrategy(
98     "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"),
99     cl::Hidden, cl::init("min"), cl::ZeroOrMore, cl::cat(PollyCategory));
100 
101 static cl::opt<std::string>
102     MaximizeBandDepth("polly-opt-maximize-bands",
103                       cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
104                       cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory));
105 
106 static cl::opt<std::string> OuterCoincidence(
107     "polly-opt-outer-coincidence",
108     cl::desc("Try to construct schedules where the outer member of each band "
109              "satisfies the coincidence constraints (yes/no)"),
110     cl::Hidden, cl::init("no"), cl::ZeroOrMore, cl::cat(PollyCategory));
111 
112 static cl::opt<int> PrevectorWidth(
113     "polly-prevect-width",
114     cl::desc(
115         "The number of loop iterations to strip-mine for pre-vectorization"),
116     cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory));
117 
118 static cl::opt<bool> FirstLevelTiling("polly-tiling",
119                                       cl::desc("Enable loop tiling"),
120                                       cl::init(true), cl::ZeroOrMore,
121                                       cl::cat(PollyCategory));
122 
123 static cl::opt<int> LatencyVectorFma(
124     "polly-target-latency-vector-fma",
125     cl::desc("The minimal number of cycles between issuing two "
126              "dependent consecutive vector fused multiply-add "
127              "instructions."),
128     cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
129 
130 static cl::opt<int> ThrougputVectorFma(
131     "polly-target-througput-vector-fma",
132     cl::desc("A throughput of the processor floating-point arithmetic units "
133              "expressed in the number of vector fused multiply-add "
134              "instructions per clock cycle."),
135     cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory));
136 
137 static cl::list<int>
138     CacheLevelAssociativity("polly-target-cache-level-associativity",
139                             cl::desc("The associativity of each cache level."),
140                             cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
141                             cl::cat(PollyCategory));
142 
143 static cl::list<int> CacheLevelSizes(
144     "polly-target-cache-level-sizes",
145     cl::desc("The size of each cache level specified in bytes."), cl::Hidden,
146     cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory));
147 
148 static cl::opt<int> FirstLevelDefaultTileSize(
149     "polly-default-tile-size",
150     cl::desc("The default tile size (if not enough were provided by"
151              " --polly-tile-sizes)"),
152     cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory));
153 
154 static cl::list<int> FirstLevelTileSizes(
155     "polly-tile-sizes", cl::desc("A tile size for each loop dimension, filled "
156                                  "with --polly-default-tile-size"),
157     cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory));
158 
159 static cl::opt<bool>
160     SecondLevelTiling("polly-2nd-level-tiling",
161                       cl::desc("Enable a 2nd level loop of loop tiling"),
162                       cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
163 
164 static cl::opt<int> SecondLevelDefaultTileSize(
165     "polly-2nd-level-default-tile-size",
166     cl::desc("The default 2nd-level tile size (if not enough were provided by"
167              " --polly-2nd-level-tile-sizes)"),
168     cl::Hidden, cl::init(16), cl::ZeroOrMore, cl::cat(PollyCategory));
169 
170 static cl::list<int>
171     SecondLevelTileSizes("polly-2nd-level-tile-sizes",
172                          cl::desc("A tile size for each loop dimension, filled "
173                                   "with --polly-default-tile-size"),
174                          cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
175                          cl::cat(PollyCategory));
176 
177 static cl::opt<bool> RegisterTiling("polly-register-tiling",
178                                     cl::desc("Enable register tiling"),
179                                     cl::init(false), cl::ZeroOrMore,
180                                     cl::cat(PollyCategory));
181 
182 static cl::opt<int> RegisterDefaultTileSize(
183     "polly-register-tiling-default-tile-size",
184     cl::desc("The default register tile size (if not enough were provided by"
185              " --polly-register-tile-sizes)"),
186     cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory));
187 
188 static cl::list<int>
189     RegisterTileSizes("polly-register-tile-sizes",
190                       cl::desc("A tile size for each loop dimension, filled "
191                                "with --polly-register-tile-size"),
192                       cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
193                       cl::cat(PollyCategory));
194 
195 static cl::opt<bool>
196     PMBasedOpts("polly-pattern-matching-based-opts",
197                 cl::desc("Perform optimizations based on pattern matching"),
198                 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
199 
200 static cl::opt<bool> OptimizedScops(
201     "polly-optimized-scops",
202     cl::desc("Polly - Dump polyhedral description of Scops optimized with "
203              "the isl scheduling optimizer and the set of post-scheduling "
204              "transformations is applied on the schedule tree"),
205     cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
206 
207 /// Create an isl_union_set, which describes the isolate option based on
208 /// IsoalteDomain.
209 ///
210 /// @param IsolateDomain An isl_set whose last dimension is the only one that
211 ///                      should belong to the current band node.
212 static __isl_give isl_union_set *
213 getIsolateOptions(__isl_take isl_set *IsolateDomain) {
214   auto Dims = isl_set_dim(IsolateDomain, isl_dim_set);
215   auto *IsolateRelation = isl_map_from_domain(IsolateDomain);
216   IsolateRelation = isl_map_move_dims(IsolateRelation, isl_dim_out, 0,
217                                       isl_dim_in, Dims - 1, 1);
218   auto *IsolateOption = isl_map_wrap(IsolateRelation);
219   auto *Id = isl_id_alloc(isl_set_get_ctx(IsolateOption), "isolate", nullptr);
220   return isl_union_set_from_set(isl_set_set_tuple_id(IsolateOption, Id));
221 }
222 
223 /// Create an isl_union_set, which describes the atomic option for the dimension
224 /// of the current node.
225 ///
226 /// It may help to reduce the size of generated code.
227 ///
228 /// @param Ctx An isl_ctx, which is used to create the isl_union_set.
229 static __isl_give isl_union_set *getAtomicOptions(__isl_take isl_ctx *Ctx) {
230   auto *Space = isl_space_set_alloc(Ctx, 0, 1);
231   auto *AtomicOption = isl_set_universe(Space);
232   auto *Id = isl_id_alloc(Ctx, "atomic", nullptr);
233   return isl_union_set_from_set(isl_set_set_tuple_id(AtomicOption, Id));
234 }
235 
236 /// Make the last dimension of Set to take values from 0 to VectorWidth - 1.
237 ///
238 /// @param Set         A set, which should be modified.
239 /// @param VectorWidth A parameter, which determines the constraint.
240 static __isl_give isl_set *addExtentConstraints(__isl_take isl_set *Set,
241                                                 int VectorWidth) {
242   auto Dims = isl_set_dim(Set, isl_dim_set);
243   auto Space = isl_set_get_space(Set);
244   auto *LocalSpace = isl_local_space_from_space(Space);
245   auto *ExtConstr =
246       isl_constraint_alloc_inequality(isl_local_space_copy(LocalSpace));
247   ExtConstr = isl_constraint_set_constant_si(ExtConstr, 0);
248   ExtConstr =
249       isl_constraint_set_coefficient_si(ExtConstr, isl_dim_set, Dims - 1, 1);
250   Set = isl_set_add_constraint(Set, ExtConstr);
251   ExtConstr = isl_constraint_alloc_inequality(LocalSpace);
252   ExtConstr = isl_constraint_set_constant_si(ExtConstr, VectorWidth - 1);
253   ExtConstr =
254       isl_constraint_set_coefficient_si(ExtConstr, isl_dim_set, Dims - 1, -1);
255   return isl_set_add_constraint(Set, ExtConstr);
256 }
257 
258 /// Build the desired set of partial tile prefixes.
259 ///
260 /// We build a set of partial tile prefixes, which are prefixes of the vector
261 /// loop that have exactly VectorWidth iterations.
262 ///
263 /// 1. Get all prefixes of the vector loop.
264 /// 2. Extend it to a set, which has exactly VectorWidth iterations for
265 ///    any prefix from the set that was built on the previous step.
266 /// 3. Subtract loop domain from it, project out the vector loop dimension and
267 ///    get a set of prefixes, which don't have exactly VectorWidth iterations.
268 /// 4. Subtract it from all prefixes of the vector loop and get the desired
269 ///    set.
270 ///
271 /// @param ScheduleRange A range of a map, which describes a prefix schedule
272 ///                      relation.
273 static __isl_give isl_set *
274 getPartialTilePrefixes(__isl_take isl_set *ScheduleRange, int VectorWidth) {
275   auto Dims = isl_set_dim(ScheduleRange, isl_dim_set);
276   auto *LoopPrefixes = isl_set_project_out(isl_set_copy(ScheduleRange),
277                                            isl_dim_set, Dims - 1, 1);
278   auto *ExtentPrefixes =
279       isl_set_add_dims(isl_set_copy(LoopPrefixes), isl_dim_set, 1);
280   ExtentPrefixes = addExtentConstraints(ExtentPrefixes, VectorWidth);
281   auto *BadPrefixes = isl_set_subtract(ExtentPrefixes, ScheduleRange);
282   BadPrefixes = isl_set_project_out(BadPrefixes, isl_dim_set, Dims - 1, 1);
283   return isl_set_subtract(LoopPrefixes, BadPrefixes);
284 }
285 
286 __isl_give isl_schedule_node *ScheduleTreeOptimizer::isolateFullPartialTiles(
287     __isl_take isl_schedule_node *Node, int VectorWidth) {
288   assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
289   Node = isl_schedule_node_child(Node, 0);
290   Node = isl_schedule_node_child(Node, 0);
291   auto *SchedRelUMap = isl_schedule_node_get_prefix_schedule_relation(Node);
292   auto *ScheduleRelation = isl_map_from_union_map(SchedRelUMap);
293   auto *ScheduleRange = isl_map_range(ScheduleRelation);
294   auto *IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth);
295   auto *AtomicOption = getAtomicOptions(isl_set_get_ctx(IsolateDomain));
296   auto *IsolateOption = getIsolateOptions(IsolateDomain);
297   Node = isl_schedule_node_parent(Node);
298   Node = isl_schedule_node_parent(Node);
299   auto *Options = isl_union_set_union(IsolateOption, AtomicOption);
300   Node = isl_schedule_node_band_set_ast_build_options(Node, Options);
301   return Node;
302 }
303 
304 __isl_give isl_schedule_node *
305 ScheduleTreeOptimizer::prevectSchedBand(__isl_take isl_schedule_node *Node,
306                                         unsigned DimToVectorize,
307                                         int VectorWidth) {
308   assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
309 
310   auto Space = isl_schedule_node_band_get_space(Node);
311   auto ScheduleDimensions = isl_space_dim(Space, isl_dim_set);
312   isl_space_free(Space);
313   assert(DimToVectorize < ScheduleDimensions);
314 
315   if (DimToVectorize > 0) {
316     Node = isl_schedule_node_band_split(Node, DimToVectorize);
317     Node = isl_schedule_node_child(Node, 0);
318   }
319   if (DimToVectorize < ScheduleDimensions - 1)
320     Node = isl_schedule_node_band_split(Node, 1);
321   Space = isl_schedule_node_band_get_space(Node);
322   auto Sizes = isl_multi_val_zero(Space);
323   auto Ctx = isl_schedule_node_get_ctx(Node);
324   Sizes =
325       isl_multi_val_set_val(Sizes, 0, isl_val_int_from_si(Ctx, VectorWidth));
326   Node = isl_schedule_node_band_tile(Node, Sizes);
327   Node = isolateFullPartialTiles(Node, VectorWidth);
328   Node = isl_schedule_node_child(Node, 0);
329   // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise,
330   // we will have troubles to match it in the backend.
331   Node = isl_schedule_node_band_set_ast_build_options(
332       Node, isl_union_set_read_from_str(Ctx, "{ unroll[x]: 1 = 0 }"));
333   Node = isl_schedule_node_band_sink(Node);
334   Node = isl_schedule_node_child(Node, 0);
335   if (isl_schedule_node_get_type(Node) == isl_schedule_node_leaf)
336     Node = isl_schedule_node_parent(Node);
337   isl_id *LoopMarker = isl_id_alloc(Ctx, "SIMD", nullptr);
338   Node = isl_schedule_node_insert_mark(Node, LoopMarker);
339   return Node;
340 }
341 
342 __isl_give isl_schedule_node *
343 ScheduleTreeOptimizer::tileNode(__isl_take isl_schedule_node *Node,
344                                 const char *Identifier, ArrayRef<int> TileSizes,
345                                 int DefaultTileSize) {
346   auto Ctx = isl_schedule_node_get_ctx(Node);
347   auto Space = isl_schedule_node_band_get_space(Node);
348   auto Dims = isl_space_dim(Space, isl_dim_set);
349   auto Sizes = isl_multi_val_zero(Space);
350   std::string IdentifierString(Identifier);
351   for (unsigned i = 0; i < Dims; i++) {
352     auto tileSize = i < TileSizes.size() ? TileSizes[i] : DefaultTileSize;
353     Sizes = isl_multi_val_set_val(Sizes, i, isl_val_int_from_si(Ctx, tileSize));
354   }
355   auto TileLoopMarkerStr = IdentifierString + " - Tiles";
356   isl_id *TileLoopMarker =
357       isl_id_alloc(Ctx, TileLoopMarkerStr.c_str(), nullptr);
358   Node = isl_schedule_node_insert_mark(Node, TileLoopMarker);
359   Node = isl_schedule_node_child(Node, 0);
360   Node = isl_schedule_node_band_tile(Node, Sizes);
361   Node = isl_schedule_node_child(Node, 0);
362   auto PointLoopMarkerStr = IdentifierString + " - Points";
363   isl_id *PointLoopMarker =
364       isl_id_alloc(Ctx, PointLoopMarkerStr.c_str(), nullptr);
365   Node = isl_schedule_node_insert_mark(Node, PointLoopMarker);
366   Node = isl_schedule_node_child(Node, 0);
367   return Node;
368 }
369 
370 __isl_give isl_schedule_node *
371 ScheduleTreeOptimizer::applyRegisterTiling(__isl_take isl_schedule_node *Node,
372                                            llvm::ArrayRef<int> TileSizes,
373                                            int DefaultTileSize) {
374   auto *Ctx = isl_schedule_node_get_ctx(Node);
375   Node = tileNode(Node, "Register tiling", TileSizes, DefaultTileSize);
376   Node = isl_schedule_node_band_set_ast_build_options(
377       Node, isl_union_set_read_from_str(Ctx, "{unroll[x]}"));
378   return Node;
379 }
380 
381 bool ScheduleTreeOptimizer::isTileableBandNode(
382     __isl_keep isl_schedule_node *Node) {
383   if (isl_schedule_node_get_type(Node) != isl_schedule_node_band)
384     return false;
385 
386   if (isl_schedule_node_n_children(Node) != 1)
387     return false;
388 
389   if (!isl_schedule_node_band_get_permutable(Node))
390     return false;
391 
392   auto Space = isl_schedule_node_band_get_space(Node);
393   auto Dims = isl_space_dim(Space, isl_dim_set);
394   isl_space_free(Space);
395 
396   if (Dims <= 1)
397     return false;
398 
399   auto Child = isl_schedule_node_get_child(Node, 0);
400   auto Type = isl_schedule_node_get_type(Child);
401   isl_schedule_node_free(Child);
402 
403   if (Type != isl_schedule_node_leaf)
404     return false;
405 
406   return true;
407 }
408 
409 __isl_give isl_schedule_node *
410 ScheduleTreeOptimizer::standardBandOpts(__isl_take isl_schedule_node *Node,
411                                         void *User) {
412   if (FirstLevelTiling)
413     Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes,
414                     FirstLevelDefaultTileSize);
415 
416   if (SecondLevelTiling)
417     Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes,
418                     SecondLevelDefaultTileSize);
419 
420   if (RegisterTiling)
421     Node =
422         applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize);
423 
424   if (PollyVectorizerChoice == VECTORIZER_NONE)
425     return Node;
426 
427   auto Space = isl_schedule_node_band_get_space(Node);
428   auto Dims = isl_space_dim(Space, isl_dim_set);
429   isl_space_free(Space);
430 
431   for (int i = Dims - 1; i >= 0; i--)
432     if (isl_schedule_node_band_member_get_coincident(Node, i)) {
433       Node = prevectSchedBand(Node, i, PrevectorWidth);
434       break;
435     }
436 
437   return Node;
438 }
439 
440 /// Check whether output dimensions of the map rely on the specified input
441 /// dimension.
442 ///
443 /// @param IslMap The isl map to be considered.
444 /// @param DimNum The number of an input dimension to be checked.
445 static bool isInputDimUsed(__isl_take isl_map *IslMap, unsigned DimNum) {
446   auto *CheckedAccessRelation =
447       isl_map_project_out(isl_map_copy(IslMap), isl_dim_in, DimNum, 1);
448   CheckedAccessRelation =
449       isl_map_insert_dims(CheckedAccessRelation, isl_dim_in, DimNum, 1);
450   auto *InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in);
451   CheckedAccessRelation =
452       isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_in, InputDimsId);
453   InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_out);
454   CheckedAccessRelation =
455       isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_out, InputDimsId);
456   auto res = !isl_map_is_equal(CheckedAccessRelation, IslMap);
457   isl_map_free(CheckedAccessRelation);
458   isl_map_free(IslMap);
459   return res;
460 }
461 
462 /// Check if the SCoP statement could probably be optimized with analytical
463 /// modeling.
464 ///
465 /// containsMatrMult tries to determine whether the following conditions
466 /// are true:
467 /// 1. all memory accesses of the statement will have stride 0 or 1,
468 ///    if we interchange loops (switch the variable used in the inner
469 ///    loop to the outer loop).
470 /// 2. all memory accesses of the statement except from the last one, are
471 ///    read memory access and the last one is write memory access.
472 /// 3. all subscripts of the last memory access of the statement don't contain
473 ///    the variable used in the inner loop.
474 ///
475 /// @param PartialSchedule The PartialSchedule that contains a SCoP statement
476 ///        to check.
477 static bool containsMatrMult(__isl_keep isl_map *PartialSchedule) {
478   auto InputDimsId = isl_map_get_tuple_id(PartialSchedule, isl_dim_in);
479   auto *ScpStmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
480   isl_id_free(InputDimsId);
481   if (ScpStmt->size() <= 1)
482     return false;
483   auto MemA = ScpStmt->begin();
484   for (unsigned i = 0; i < ScpStmt->size() - 2 && MemA != ScpStmt->end();
485        i++, MemA++)
486     if (!(*MemA)->isRead() ||
487         ((*MemA)->isArrayKind() &&
488          !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) ||
489            (*MemA)->isStrideZero(isl_map_copy(PartialSchedule)))))
490       return false;
491   MemA++;
492   if (!(*MemA)->isWrite() || !(*MemA)->isArrayKind() ||
493       !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) ||
494         (*MemA)->isStrideZero(isl_map_copy(PartialSchedule))))
495     return false;
496   auto DimNum = isl_map_dim(PartialSchedule, isl_dim_in);
497   return !isInputDimUsed((*MemA)->getAccessRelation(), DimNum - 1);
498 }
499 
500 /// Circular shift of output dimensions of the integer map.
501 ///
502 /// @param IslMap The isl map to be modified.
503 static __isl_give isl_map *circularShiftOutputDims(__isl_take isl_map *IslMap) {
504   auto DimNum = isl_map_dim(IslMap, isl_dim_out);
505   if (DimNum == 0)
506     return IslMap;
507   auto InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in);
508   IslMap = isl_map_move_dims(IslMap, isl_dim_in, 0, isl_dim_out, DimNum - 1, 1);
509   IslMap = isl_map_move_dims(IslMap, isl_dim_out, 0, isl_dim_in, 0, 1);
510   return isl_map_set_tuple_id(IslMap, isl_dim_in, InputDimsId);
511 }
512 
513 /// Permute two dimensions of the band node.
514 ///
515 /// Permute FirstDim and SecondDim dimensions of the Node.
516 ///
517 /// @param Node The band node to be modified.
518 /// @param FirstDim The first dimension to be permuted.
519 /// @param SecondDim The second dimension to be permuted.
520 static __isl_give isl_schedule_node *
521 permuteBandNodeDimensions(__isl_take isl_schedule_node *Node, unsigned FirstDim,
522                           unsigned SecondDim) {
523   assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band &&
524          isl_schedule_node_band_n_member(Node) > std::max(FirstDim, SecondDim));
525   auto PartialSchedule = isl_schedule_node_band_get_partial_schedule(Node);
526   auto PartialScheduleFirstDim =
527       isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, FirstDim);
528   auto PartialScheduleSecondDim =
529       isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, SecondDim);
530   PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
531       PartialSchedule, SecondDim, PartialScheduleFirstDim);
532   PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
533       PartialSchedule, FirstDim, PartialScheduleSecondDim);
534   Node = isl_schedule_node_delete(Node);
535   Node = isl_schedule_node_insert_partial_schedule(Node, PartialSchedule);
536   return Node;
537 }
538 
539 __isl_give isl_schedule_node *ScheduleTreeOptimizer::createMicroKernel(
540     __isl_take isl_schedule_node *Node, MicroKernelParamsTy MicroKernelParams) {
541   return applyRegisterTiling(Node, {MicroKernelParams.Mr, MicroKernelParams.Nr},
542                              1);
543 }
544 
545 __isl_give isl_schedule_node *ScheduleTreeOptimizer::createMacroKernel(
546     __isl_take isl_schedule_node *Node, MacroKernelParamsTy MacroKernelParams) {
547   assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
548   if (MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 1 &&
549       MacroKernelParams.Kc == 1)
550     return Node;
551   Node = tileNode(
552       Node, "1st level tiling",
553       {MacroKernelParams.Mc, MacroKernelParams.Nc, MacroKernelParams.Kc}, 1);
554   Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
555   Node = permuteBandNodeDimensions(Node, 1, 2);
556   return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
557 }
558 
559 /// Get parameters of the BLIS micro kernel.
560 ///
561 /// We choose the Mr and Nr parameters of the micro kernel to be large enough
562 /// such that no stalls caused by the combination of latencies and dependencies
563 /// are introduced during the updates of the resulting matrix of the matrix
564 /// multiplication. However, they should also be as small as possible to
565 /// release more registers for entries of multiplied matrices.
566 ///
567 /// @param TTI Target Transform Info.
568 /// @return The structure of type MicroKernelParamsTy.
569 /// @see MicroKernelParamsTy
570 static struct MicroKernelParamsTy
571 getMicroKernelParams(const llvm::TargetTransformInfo *TTI) {
572   assert(TTI && "The target transform info should be provided.");
573 
574   // Nvec - Number of double-precision floating-point numbers that can be hold
575   // by a vector register. Use 2 by default.
576   auto Nvec = TTI->getRegisterBitWidth(true) / 64;
577   if (Nvec == 0)
578     Nvec = 2;
579   int Nr =
580       ceil(sqrt(Nvec * LatencyVectorFma * ThrougputVectorFma) / Nvec) * Nvec;
581   int Mr = ceil(Nvec * LatencyVectorFma * ThrougputVectorFma / Nr);
582   return {Mr, Nr};
583 }
584 
585 /// Get parameters of the BLIS macro kernel.
586 ///
587 /// During the computation of matrix multiplication, blocks of partitioned
588 /// matrices are mapped to different layers of the memory hierarchy.
589 /// To optimize data reuse, blocks should be ideally kept in cache between
590 /// iterations. Since parameters of the macro kernel determine sizes of these
591 /// blocks, there are upper and lower bounds on these parameters.
592 ///
593 /// @param MicroKernelParams Parameters of the micro-kernel
594 ///                          to be taken into account.
595 /// @return The structure of type MacroKernelParamsTy.
596 /// @see MacroKernelParamsTy
597 /// @see MicroKernelParamsTy
598 static struct MacroKernelParamsTy
599 getMacroKernelParams(const MicroKernelParamsTy &MicroKernelParams) {
600   // According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf,
601   // it requires information about the first two levels of a cache to determine
602   // all the parameters of a macro-kernel. It also checks that an associativity
603   // degree of a cache level is greater than two. Otherwise, another algorithm
604   // for determination of the parameters should be used.
605   if (!(MicroKernelParams.Mr > 0 && MicroKernelParams.Nr > 0 &&
606         CacheLevelSizes.size() >= 2 && CacheLevelAssociativity.size() >= 2 &&
607         CacheLevelSizes[0] > 0 && CacheLevelSizes[1] > 0 &&
608         CacheLevelAssociativity[0] > 2 && CacheLevelAssociativity[1] > 2))
609     return {1, 1, 1};
610   int Cbr = floor(
611       (CacheLevelAssociativity[0] - 1) /
612       (1 + static_cast<double>(MicroKernelParams.Mr) / MicroKernelParams.Nr));
613   int Kc = (Cbr * CacheLevelSizes[0]) /
614            (MicroKernelParams.Nr * CacheLevelAssociativity[0] * 8);
615   double Cac = static_cast<double>(MicroKernelParams.Mr * Kc * 8 *
616                                    CacheLevelAssociativity[1]) /
617                CacheLevelSizes[1];
618   double Cbc = static_cast<double>(MicroKernelParams.Nr * Kc * 8 *
619                                    CacheLevelAssociativity[1]) /
620                CacheLevelSizes[1];
621   int Mc = floor(MicroKernelParams.Mr / Cac);
622   int Nc =
623       floor((MicroKernelParams.Nr * (CacheLevelAssociativity[1] - 2)) / Cbc);
624   return {Mc, Nc, Kc};
625 }
626 
627 /// Identify a memory access through the shape of its memory access relation.
628 ///
629 /// Identify the unique memory access in @p Stmt, that has an access relation
630 /// equal to @p ExpectedAccessRelation.
631 ///
632 /// @param Stmt The SCoP statement that contains the memory accesses under
633 ///             consideration.
634 /// @param ExpectedAccessRelation The access relation that identifies
635 ///                               the memory access.
636 /// @return  The memory access of @p Stmt whose memory access relation is equal
637 ///          to @p ExpectedAccessRelation. nullptr in case there is no or more
638 ///          than one such access.
639 MemoryAccess *
640 identifyAccessByAccessRelation(ScopStmt *Stmt,
641                                __isl_take isl_map *ExpectedAccessRelation) {
642   if (isl_map_has_tuple_id(ExpectedAccessRelation, isl_dim_out))
643     ExpectedAccessRelation =
644         isl_map_reset_tuple_id(ExpectedAccessRelation, isl_dim_out);
645   MemoryAccess *IdentifiedAccess = nullptr;
646   for (auto *Access : *Stmt) {
647     auto *AccessRelation = Access->getAccessRelation();
648     AccessRelation = isl_map_reset_tuple_id(AccessRelation, isl_dim_out);
649     if (isl_map_is_equal(ExpectedAccessRelation, AccessRelation)) {
650       if (IdentifiedAccess) {
651         isl_map_free(AccessRelation);
652         isl_map_free(ExpectedAccessRelation);
653         return nullptr;
654       }
655       IdentifiedAccess = Access;
656     }
657     isl_map_free(AccessRelation);
658   }
659   isl_map_free(ExpectedAccessRelation);
660   return IdentifiedAccess;
661 }
662 
663 /// Add constrains to @Dim dimension of @p ExtMap.
664 ///
665 /// If @ExtMap has the following form [O0, O1, O2]->[I1, I2, I3],
666 /// the following constraint will be added
667 /// Bound * OM <= IM <= Bound * (OM + 1) - 1,
668 /// where M is @p Dim and Bound is @p Bound.
669 ///
670 /// @param ExtMap The isl map to be modified.
671 /// @param Dim The output dimension to be modfied.
672 /// @param Bound The value that is used to specify the constraint.
673 /// @return The modified isl map
674 __isl_give isl_map *
675 addExtensionMapMatMulDimConstraint(__isl_take isl_map *ExtMap, unsigned Dim,
676                                    unsigned Bound) {
677   assert(Bound != 0);
678   auto *ExtMapSpace = isl_map_get_space(ExtMap);
679   auto *ConstrSpace = isl_local_space_from_space(ExtMapSpace);
680   auto *Constr =
681       isl_constraint_alloc_inequality(isl_local_space_copy(ConstrSpace));
682   Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_out, Dim, 1);
683   Constr =
684       isl_constraint_set_coefficient_si(Constr, isl_dim_in, Dim, Bound * (-1));
685   ExtMap = isl_map_add_constraint(ExtMap, Constr);
686   Constr = isl_constraint_alloc_inequality(ConstrSpace);
687   Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_out, Dim, -1);
688   Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_in, Dim, Bound);
689   Constr = isl_constraint_set_constant_si(Constr, Bound - 1);
690   return isl_map_add_constraint(ExtMap, Constr);
691 }
692 
693 /// Create an access relation that is specific for matrix multiplication
694 /// pattern.
695 ///
696 /// Create an access relation of the following form:
697 /// { [O0, O1, O2]->[I1, I2, I3] :
698 ///   FirstOutputDimBound * O0 <= I1 <= FirstOutputDimBound * (O0 + 1) - 1
699 ///   and SecondOutputDimBound * O1 <= I2 <= SecondOutputDimBound * (O1 + 1) - 1
700 ///   and ThirdOutputDimBound * O2 <= I3 <= ThirdOutputDimBound * (O2 + 1) - 1}
701 ///   where FirstOutputDimBound is @p FirstOutputDimBound,
702 ///   SecondOutputDimBound is @p SecondOutputDimBound,
703 ///   ThirdOutputDimBound is @p ThirdOutputDimBound
704 ///
705 /// @param Ctx The isl context.
706 /// @param FirstOutputDimBound,
707 ///        SecondOutputDimBound,
708 ///        ThirdOutputDimBound The parameters of the access relation.
709 /// @return The specified access relation.
710 __isl_give isl_map *getMatMulExt(isl_ctx *Ctx, unsigned FirstOutputDimBound,
711                                  unsigned SecondOutputDimBound,
712                                  unsigned ThirdOutputDimBound) {
713   auto *NewRelSpace = isl_space_alloc(Ctx, 0, 3, 3);
714   auto *extensionMap = isl_map_universe(NewRelSpace);
715   if (!FirstOutputDimBound)
716     extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 0, 0);
717   else
718     extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 0,
719                                                       FirstOutputDimBound);
720   if (!SecondOutputDimBound)
721     extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 1, 0);
722   else
723     extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 1,
724                                                       SecondOutputDimBound);
725   if (!ThirdOutputDimBound)
726     extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 2, 0);
727   else
728     extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 2,
729                                                       ThirdOutputDimBound);
730   return extensionMap;
731 }
732 
733 /// Create an access relation that is specific to the matrix
734 ///        multiplication pattern.
735 ///
736 /// Create an access relation of the following form:
737 /// Stmt[O0, O1, O2]->[OI, OJ],
738 /// where I is @p I, J is @J
739 ///
740 /// @param Stmt The SCoP statement for which to generate the access relation.
741 /// @param I The index of the input dimension that is mapped to the first output
742 ///          dimension.
743 /// @param J The index of the input dimension that is mapped to the second
744 ///          output dimension.
745 /// @return The specified access relation.
746 __isl_give isl_map *
747 getMatMulPatternOriginalAccessRelation(ScopStmt *Stmt, unsigned I, unsigned J) {
748   auto *AccessRelSpace = isl_space_alloc(Stmt->getIslCtx(), 0, 3, 2);
749   auto *AccessRel = isl_map_universe(AccessRelSpace);
750   AccessRel = isl_map_equate(AccessRel, isl_dim_in, I, isl_dim_out, 0);
751   AccessRel = isl_map_equate(AccessRel, isl_dim_in, J, isl_dim_out, 1);
752   AccessRel = isl_map_set_tuple_id(AccessRel, isl_dim_in, Stmt->getDomainId());
753   return AccessRel;
754 }
755 
756 /// Identify the memory access that corresponds to the access to the second
757 /// operand of the matrix multiplication.
758 ///
759 /// Identify the memory access that corresponds to the access
760 /// to the matrix B of the matrix multiplication C = A x B.
761 ///
762 /// @param Stmt The SCoP statement that contains the memory accesses
763 ///             under consideration.
764 /// @return The memory access of @p Stmt that corresponds to the access
765 ///         to the second operand of the matrix multiplication.
766 MemoryAccess *identifyAccessA(ScopStmt *Stmt) {
767   auto *OriginalRel = getMatMulPatternOriginalAccessRelation(Stmt, 0, 2);
768   return identifyAccessByAccessRelation(Stmt, OriginalRel);
769 }
770 
771 /// Identify the memory access that corresponds to the access to the first
772 /// operand of the matrix multiplication.
773 ///
774 /// Identify the memory access that corresponds to the access
775 /// to the matrix A of the matrix multiplication C = A x B.
776 ///
777 /// @param Stmt The SCoP statement that contains the memory accesses
778 ///             under consideration.
779 /// @return The memory access of @p Stmt that corresponds to the access
780 ///         to the first operand of the matrix multiplication.
781 MemoryAccess *identifyAccessB(ScopStmt *Stmt) {
782   auto *OriginalRel = getMatMulPatternOriginalAccessRelation(Stmt, 2, 1);
783   return identifyAccessByAccessRelation(Stmt, OriginalRel);
784 }
785 
786 /// Create an access relation that is specific to
787 ///        the matrix multiplication pattern.
788 ///
789 /// Create an access relation of the following form:
790 /// [O0, O1, O2, O3, O4, O5, O6, O7, O8] -> [O5 + K * OI, OJ],
791 /// where K is @p Coeff, I is @p FirstDim, J is @p SecondDim.
792 ///
793 /// It can be used, for example, to create relations that helps to consequently
794 /// access elements of operands of a matrix multiplication after creation of
795 /// the BLIS micro and macro kernels.
796 ///
797 /// @see ScheduleTreeOptimizer::createMicroKernel
798 /// @see ScheduleTreeOptimizer::createMacroKernel
799 ///
800 /// Subsequently, the described access relation is applied to the range of
801 /// @p MapOldIndVar, that is used to map original induction variables to
802 /// the ones, which are produced by schedule transformations. It helps to
803 /// define relations using a new space and, at the same time, keep them
804 /// in the original one.
805 ///
806 /// @param MapOldIndVar The relation, which maps original induction variables
807 ///                     to the ones, which are produced by schedule
808 ///                     transformations.
809 /// @param Coeff The coefficient that is used to define the specified access
810 ///              relation.
811 /// @param FirstDim, SecondDim The input dimensions that are used to define
812 ///        the specified access relation.
813 /// @return The specified access relation.
814 __isl_give isl_map *getMatMulAccRel(__isl_take isl_map *MapOldIndVar,
815                                     unsigned Coeff, unsigned FirstDim,
816                                     unsigned SecondDim) {
817   auto *Ctx = isl_map_get_ctx(MapOldIndVar);
818   auto *AccessRelSpace = isl_space_alloc(Ctx, 0, 9, 2);
819   auto *AccessRel = isl_map_universe(isl_space_copy(AccessRelSpace));
820   auto *ConstrSpace = isl_local_space_from_space(AccessRelSpace);
821   auto *Constr = isl_constraint_alloc_equality(ConstrSpace);
822   Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_out, 0, -1);
823   Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_in, 5, 1);
824   Constr =
825       isl_constraint_set_coefficient_si(Constr, isl_dim_in, FirstDim, Coeff);
826   AccessRel = isl_map_add_constraint(AccessRel, Constr);
827   AccessRel = isl_map_equate(AccessRel, isl_dim_in, SecondDim, isl_dim_out, 1);
828   return isl_map_apply_range(MapOldIndVar, AccessRel);
829 }
830 
831 __isl_give isl_schedule_node *
832 createExtensionNode(__isl_take isl_schedule_node *Node,
833                     __isl_take isl_map *ExtensionMap) {
834   auto *Extension = isl_union_map_from_map(ExtensionMap);
835   auto *NewNode = isl_schedule_node_from_extension(Extension);
836   return isl_schedule_node_graft_before(Node, NewNode);
837 }
838 
839 /// Apply the packing transformation.
840 ///
841 /// The packing transformation can be described as a data-layout
842 /// transformation that requires to introduce a new array, copy data
843 /// to the array, and change memory access locations of the compute kernel
844 /// to reference the array.
845 ///
846 /// @param Node The schedule node to be optimized.
847 /// @param MapOldIndVar The relation, which maps original induction variables
848 ///                     to the ones, which are produced by schedule
849 ///                     transformations.
850 /// @param MicroParams, MacroParams Parameters of the BLIS kernel
851 ///                                 to be taken into account.
852 /// @return The optimized schedule node.
853 static __isl_give isl_schedule_node *optimizeDataLayoutMatrMulPattern(
854     __isl_take isl_schedule_node *Node, __isl_take isl_map *MapOldIndVar,
855     MicroKernelParamsTy MicroParams, MacroKernelParamsTy MacroParams) {
856   auto InputDimsId = isl_map_get_tuple_id(MapOldIndVar, isl_dim_in);
857   auto *Stmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
858   isl_id_free(InputDimsId);
859   MemoryAccess *MemAccessA = identifyAccessA(Stmt);
860   MemoryAccess *MemAccessB = identifyAccessB(Stmt);
861   if (!MemAccessA || !MemAccessB) {
862     isl_map_free(MapOldIndVar);
863     return Node;
864   }
865   Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
866   Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
867   Node = isl_schedule_node_parent(Node);
868   Node = isl_schedule_node_child(isl_schedule_node_band_split(Node, 2), 0);
869   auto *AccRel =
870       getMatMulAccRel(isl_map_copy(MapOldIndVar), MacroParams.Kc, 3, 6);
871   unsigned FirstDimSize = MacroParams.Mc * MacroParams.Kc / MicroParams.Mr;
872   unsigned SecondDimSize = MicroParams.Mr;
873   auto *SAI = Stmt->getParent()->createScopArrayInfo(
874       MemAccessA->getElementType(), "Packed_A", {FirstDimSize, SecondDimSize});
875   AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
876   auto *OldAcc = MemAccessA->getAccessRelation();
877   MemAccessA->setNewAccessRelation(AccRel);
878   auto *ExtMap =
879       getMatMulExt(Stmt->getIslCtx(), MacroParams.Mc, 0, MacroParams.Kc);
880   ExtMap = isl_map_project_out(ExtMap, isl_dim_in, 1, 1);
881   auto *Domain = Stmt->getDomain();
882   auto *NewStmt = Stmt->getParent()->addScopStmt(
883       OldAcc, MemAccessA->getAccessRelation(), isl_set_copy(Domain));
884   ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
885   Node = createExtensionNode(Node, ExtMap);
886   Node = isl_schedule_node_child(Node, 0);
887   AccRel = getMatMulAccRel(MapOldIndVar, MacroParams.Kc, 4, 7);
888   FirstDimSize = MacroParams.Nc * MacroParams.Kc / MicroParams.Nr;
889   SecondDimSize = MicroParams.Nr;
890   SAI = Stmt->getParent()->createScopArrayInfo(
891       MemAccessB->getElementType(), "Packed_B", {FirstDimSize, SecondDimSize});
892   AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
893   OldAcc = MemAccessB->getAccessRelation();
894   MemAccessB->setNewAccessRelation(AccRel);
895   ExtMap = getMatMulExt(Stmt->getIslCtx(), 0, MacroParams.Nc, MacroParams.Kc);
896   isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 1, 1);
897   isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
898   NewStmt = Stmt->getParent()->addScopStmt(
899       OldAcc, MemAccessB->getAccessRelation(), Domain);
900   ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
901   Node = createExtensionNode(Node, ExtMap);
902   Node = isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
903   return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
904 }
905 
906 /// Get a relation mapping induction variables produced by schedule
907 /// transformations to the original ones.
908 ///
909 /// @param Node The schedule node produced as the result of creation
910 ///        of the BLIS kernels.
911 /// @param MicroKernelParams, MacroKernelParams Parameters of the BLIS kernel
912 ///                                             to be taken into account.
913 /// @return  The relation mapping original induction variables to the ones
914 ///          produced by schedule transformation.
915 /// @see ScheduleTreeOptimizer::createMicroKernel
916 /// @see ScheduleTreeOptimizer::createMacroKernel
917 /// @see getMacroKernelParams
918 __isl_give isl_map *
919 getInductionVariablesSubstitution(__isl_take isl_schedule_node *Node,
920                                   MicroKernelParamsTy MicroKernelParams,
921                                   MacroKernelParamsTy MacroKernelParams) {
922   auto *Child = isl_schedule_node_get_child(Node, 0);
923   auto *UnMapOldIndVar = isl_schedule_node_get_prefix_schedule_union_map(Child);
924   isl_schedule_node_free(Child);
925   auto *MapOldIndVar = isl_map_from_union_map(UnMapOldIndVar);
926   if (isl_map_dim(MapOldIndVar, isl_dim_out) > 9)
927     MapOldIndVar =
928         isl_map_project_out(MapOldIndVar, isl_dim_out, 0,
929                             isl_map_dim(MapOldIndVar, isl_dim_out) - 9);
930   return MapOldIndVar;
931 }
932 
933 __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern(
934     __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
935   assert(TTI && "The target transform info should be provided.");
936   auto MicroKernelParams = getMicroKernelParams(TTI);
937   auto MacroKernelParams = getMacroKernelParams(MicroKernelParams);
938   Node = createMacroKernel(Node, MacroKernelParams);
939   Node = createMicroKernel(Node, MicroKernelParams);
940   if (MacroKernelParams.Mc == 1 || MacroKernelParams.Nc == 1 ||
941       MacroKernelParams.Kc == 1)
942     return Node;
943   auto *MapOldIndVar = getInductionVariablesSubstitution(
944       Node, MicroKernelParams, MacroKernelParams);
945   if (!MapOldIndVar)
946     return Node;
947   return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams,
948                                           MacroKernelParams);
949 }
950 
951 bool ScheduleTreeOptimizer::isMatrMultPattern(
952     __isl_keep isl_schedule_node *Node) {
953   auto *PartialSchedule =
954       isl_schedule_node_band_get_partial_schedule_union_map(Node);
955   if (isl_schedule_node_band_n_member(Node) != 3 ||
956       isl_union_map_n_map(PartialSchedule) != 1) {
957     isl_union_map_free(PartialSchedule);
958     return false;
959   }
960   auto *NewPartialSchedule = isl_map_from_union_map(PartialSchedule);
961   NewPartialSchedule = circularShiftOutputDims(NewPartialSchedule);
962   if (containsMatrMult(NewPartialSchedule)) {
963     isl_map_free(NewPartialSchedule);
964     return true;
965   }
966   isl_map_free(NewPartialSchedule);
967   return false;
968 }
969 
970 __isl_give isl_schedule_node *
971 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node,
972                                     void *User) {
973   if (!isTileableBandNode(Node))
974     return Node;
975 
976   if (PMBasedOpts && User && isMatrMultPattern(Node)) {
977     DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
978     const llvm::TargetTransformInfo *TTI;
979     TTI = static_cast<const llvm::TargetTransformInfo *>(User);
980     Node = optimizeMatMulPattern(Node, TTI);
981   }
982 
983   return standardBandOpts(Node, User);
984 }
985 
986 __isl_give isl_schedule *
987 ScheduleTreeOptimizer::optimizeSchedule(__isl_take isl_schedule *Schedule,
988                                         const llvm::TargetTransformInfo *TTI) {
989   isl_schedule_node *Root = isl_schedule_get_root(Schedule);
990   Root = optimizeScheduleNode(Root, TTI);
991   isl_schedule_free(Schedule);
992   auto S = isl_schedule_node_get_schedule(Root);
993   isl_schedule_node_free(Root);
994   return S;
995 }
996 
997 __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeScheduleNode(
998     __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
999   Node = isl_schedule_node_map_descendant_bottom_up(
1000       Node, optimizeBand, const_cast<void *>(static_cast<const void *>(TTI)));
1001   return Node;
1002 }
1003 
1004 bool ScheduleTreeOptimizer::isProfitableSchedule(
1005     Scop &S, __isl_keep isl_schedule *NewSchedule) {
1006   // To understand if the schedule has been optimized we check if the schedule
1007   // has changed at all.
1008   // TODO: We can improve this by tracking if any necessarily beneficial
1009   // transformations have been performed. This can e.g. be tiling, loop
1010   // interchange, or ...) We can track this either at the place where the
1011   // transformation has been performed or, in case of automatic ILP based
1012   // optimizations, by comparing (yet to be defined) performance metrics
1013   // before/after the scheduling optimizer
1014   // (e.g., #stride-one accesses)
1015   if (S.containsExtensionNode(NewSchedule))
1016     return true;
1017   auto *NewScheduleMap = isl_schedule_get_map(NewSchedule);
1018   isl_union_map *OldSchedule = S.getSchedule();
1019   assert(OldSchedule && "Only IslScheduleOptimizer can insert extension nodes "
1020                         "that make Scop::getSchedule() return nullptr.");
1021   bool changed = !isl_union_map_is_equal(OldSchedule, NewScheduleMap);
1022   isl_union_map_free(OldSchedule);
1023   isl_union_map_free(NewScheduleMap);
1024   return changed;
1025 }
1026 
1027 namespace {
1028 class IslScheduleOptimizer : public ScopPass {
1029 public:
1030   static char ID;
1031   explicit IslScheduleOptimizer() : ScopPass(ID) { LastSchedule = nullptr; }
1032 
1033   ~IslScheduleOptimizer() { isl_schedule_free(LastSchedule); }
1034 
1035   /// Optimize the schedule of the SCoP @p S.
1036   bool runOnScop(Scop &S) override;
1037 
1038   /// Print the new schedule for the SCoP @p S.
1039   void printScop(raw_ostream &OS, Scop &S) const override;
1040 
1041   /// Register all analyses and transformation required.
1042   void getAnalysisUsage(AnalysisUsage &AU) const override;
1043 
1044   /// Release the internal memory.
1045   void releaseMemory() override {
1046     isl_schedule_free(LastSchedule);
1047     LastSchedule = nullptr;
1048   }
1049 
1050 private:
1051   isl_schedule *LastSchedule;
1052 };
1053 } // namespace
1054 
1055 char IslScheduleOptimizer::ID = 0;
1056 
1057 bool IslScheduleOptimizer::runOnScop(Scop &S) {
1058 
1059   // Skip empty SCoPs but still allow code generation as it will delete the
1060   // loops present but not needed.
1061   if (S.getSize() == 0) {
1062     S.markAsOptimized();
1063     return false;
1064   }
1065 
1066   const Dependences &D =
1067       getAnalysis<DependenceInfo>().getDependences(Dependences::AL_Statement);
1068 
1069   if (!D.hasValidDependences())
1070     return false;
1071 
1072   isl_schedule_free(LastSchedule);
1073   LastSchedule = nullptr;
1074 
1075   // Build input data.
1076   int ValidityKinds =
1077       Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
1078   int ProximityKinds;
1079 
1080   if (OptimizeDeps == "all")
1081     ProximityKinds =
1082         Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
1083   else if (OptimizeDeps == "raw")
1084     ProximityKinds = Dependences::TYPE_RAW;
1085   else {
1086     errs() << "Do not know how to optimize for '" << OptimizeDeps << "'"
1087            << " Falling back to optimizing all dependences.\n";
1088     ProximityKinds =
1089         Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
1090   }
1091 
1092   isl_union_set *Domain = S.getDomains();
1093 
1094   if (!Domain)
1095     return false;
1096 
1097   isl_union_map *Validity = D.getDependences(ValidityKinds);
1098   isl_union_map *Proximity = D.getDependences(ProximityKinds);
1099 
1100   // Simplify the dependences by removing the constraints introduced by the
1101   // domains. This can speed up the scheduling time significantly, as large
1102   // constant coefficients will be removed from the dependences. The
1103   // introduction of some additional dependences reduces the possible
1104   // transformations, but in most cases, such transformation do not seem to be
1105   // interesting anyway. In some cases this option may stop the scheduler to
1106   // find any schedule.
1107   if (SimplifyDeps == "yes") {
1108     Validity = isl_union_map_gist_domain(Validity, isl_union_set_copy(Domain));
1109     Validity = isl_union_map_gist_range(Validity, isl_union_set_copy(Domain));
1110     Proximity =
1111         isl_union_map_gist_domain(Proximity, isl_union_set_copy(Domain));
1112     Proximity = isl_union_map_gist_range(Proximity, isl_union_set_copy(Domain));
1113   } else if (SimplifyDeps != "no") {
1114     errs() << "warning: Option -polly-opt-simplify-deps should either be 'yes' "
1115               "or 'no'. Falling back to default: 'yes'\n";
1116   }
1117 
1118   DEBUG(dbgs() << "\n\nCompute schedule from: ");
1119   DEBUG(dbgs() << "Domain := " << stringFromIslObj(Domain) << ";\n");
1120   DEBUG(dbgs() << "Proximity := " << stringFromIslObj(Proximity) << ";\n");
1121   DEBUG(dbgs() << "Validity := " << stringFromIslObj(Validity) << ";\n");
1122 
1123   unsigned IslSerializeSCCs;
1124 
1125   if (FusionStrategy == "max") {
1126     IslSerializeSCCs = 0;
1127   } else if (FusionStrategy == "min") {
1128     IslSerializeSCCs = 1;
1129   } else {
1130     errs() << "warning: Unknown fusion strategy. Falling back to maximal "
1131               "fusion.\n";
1132     IslSerializeSCCs = 0;
1133   }
1134 
1135   int IslMaximizeBands;
1136 
1137   if (MaximizeBandDepth == "yes") {
1138     IslMaximizeBands = 1;
1139   } else if (MaximizeBandDepth == "no") {
1140     IslMaximizeBands = 0;
1141   } else {
1142     errs() << "warning: Option -polly-opt-maximize-bands should either be 'yes'"
1143               " or 'no'. Falling back to default: 'yes'\n";
1144     IslMaximizeBands = 1;
1145   }
1146 
1147   int IslOuterCoincidence;
1148 
1149   if (OuterCoincidence == "yes") {
1150     IslOuterCoincidence = 1;
1151   } else if (OuterCoincidence == "no") {
1152     IslOuterCoincidence = 0;
1153   } else {
1154     errs() << "warning: Option -polly-opt-outer-coincidence should either be "
1155               "'yes' or 'no'. Falling back to default: 'no'\n";
1156     IslOuterCoincidence = 0;
1157   }
1158 
1159   isl_ctx *Ctx = S.getIslCtx();
1160 
1161   isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence);
1162   isl_options_set_schedule_serialize_sccs(Ctx, IslSerializeSCCs);
1163   isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands);
1164   isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm);
1165   isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient);
1166   isl_options_set_tile_scale_tile_loops(Ctx, 0);
1167 
1168   auto OnErrorStatus = isl_options_get_on_error(Ctx);
1169   isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE);
1170 
1171   isl_schedule_constraints *ScheduleConstraints;
1172   ScheduleConstraints = isl_schedule_constraints_on_domain(Domain);
1173   ScheduleConstraints =
1174       isl_schedule_constraints_set_proximity(ScheduleConstraints, Proximity);
1175   ScheduleConstraints = isl_schedule_constraints_set_validity(
1176       ScheduleConstraints, isl_union_map_copy(Validity));
1177   ScheduleConstraints =
1178       isl_schedule_constraints_set_coincidence(ScheduleConstraints, Validity);
1179   isl_schedule *Schedule;
1180   Schedule = isl_schedule_constraints_compute_schedule(ScheduleConstraints);
1181   isl_options_set_on_error(Ctx, OnErrorStatus);
1182 
1183   // In cases the scheduler is not able to optimize the code, we just do not
1184   // touch the schedule.
1185   if (!Schedule)
1186     return false;
1187 
1188   DEBUG({
1189     auto *P = isl_printer_to_str(Ctx);
1190     P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
1191     P = isl_printer_print_schedule(P, Schedule);
1192     dbgs() << "NewScheduleTree: \n" << isl_printer_get_str(P) << "\n";
1193     isl_printer_free(P);
1194   });
1195 
1196   Function &F = S.getFunction();
1197   auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1198   isl_schedule *NewSchedule =
1199       ScheduleTreeOptimizer::optimizeSchedule(Schedule, TTI);
1200 
1201   if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewSchedule)) {
1202     isl_schedule_free(NewSchedule);
1203     return false;
1204   }
1205 
1206   S.setScheduleTree(NewSchedule);
1207   S.markAsOptimized();
1208 
1209   if (OptimizedScops)
1210     S.dump();
1211 
1212   return false;
1213 }
1214 
1215 void IslScheduleOptimizer::printScop(raw_ostream &OS, Scop &) const {
1216   isl_printer *p;
1217   char *ScheduleStr;
1218 
1219   OS << "Calculated schedule:\n";
1220 
1221   if (!LastSchedule) {
1222     OS << "n/a\n";
1223     return;
1224   }
1225 
1226   p = isl_printer_to_str(isl_schedule_get_ctx(LastSchedule));
1227   p = isl_printer_print_schedule(p, LastSchedule);
1228   ScheduleStr = isl_printer_get_str(p);
1229   isl_printer_free(p);
1230 
1231   OS << ScheduleStr << "\n";
1232 }
1233 
1234 void IslScheduleOptimizer::getAnalysisUsage(AnalysisUsage &AU) const {
1235   ScopPass::getAnalysisUsage(AU);
1236   AU.addRequired<DependenceInfo>();
1237   AU.addRequired<TargetTransformInfoWrapperPass>();
1238 }
1239 
1240 Pass *polly::createIslScheduleOptimizerPass() {
1241   return new IslScheduleOptimizer();
1242 }
1243 
1244 INITIALIZE_PASS_BEGIN(IslScheduleOptimizer, "polly-opt-isl",
1245                       "Polly - Optimize schedule of SCoP", false, false);
1246 INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
1247 INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass);
1248 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass);
1249 INITIALIZE_PASS_END(IslScheduleOptimizer, "polly-opt-isl",
1250                     "Polly - Optimize schedule of SCoP", false, false)
1251