1 //===- Schedule.cpp - Calculate an optimized schedule ---------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass generates an entirely new schedule tree from the data dependences
11 // and iteration domains. The new schedule tree is computed in two steps:
12 //
13 // 1) The isl scheduling optimizer is run
14 //
15 // The isl scheduling optimizer creates a new schedule tree that maximizes
16 // parallelism and tileability and minimizes data-dependence distances. The
17 // algorithm used is a modified version of the ``Pluto'' algorithm:
18 //
19 //   U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
20 //   A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
21 //   In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
22 //   Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008.
23 //
24 // 2) A set of post-scheduling transformations is applied on the schedule tree.
25 //
26 // These optimizations include:
27 //
28 //  - Tiling of the innermost tilable bands
29 //  - Prevectorization - The coice of a possible outer loop that is strip-mined
30 //                       to the innermost level to enable inner-loop
31 //                       vectorization.
32 //  - Some optimizations for spatial locality are also planned.
33 //
34 // For a detailed description of the schedule tree itself please see section 6
35 // of:
36 //
37 // Polyhedral AST generation is more than scanning polyhedra
38 // Tobias Grosser, Sven Verdoolaege, Albert Cohen
39 // ACM Transations on Programming Languages and Systems (TOPLAS),
40 // 37(4), July 2015
41 // http://www.grosser.es/#pub-polyhedral-AST-generation
42 //
43 // This publication also contains a detailed discussion of the different options
44 // for polyhedral loop unrolling, full/partial tile separation and other uses
45 // of the schedule tree.
46 //
47 //===----------------------------------------------------------------------===//
48 
49 #include "polly/ScheduleOptimizer.h"
50 #include "polly/CodeGen/CodeGeneration.h"
51 #include "polly/DependenceInfo.h"
52 #include "polly/LinkAllPasses.h"
53 #include "polly/Options.h"
54 #include "polly/ScopInfo.h"
55 #include "polly/Support/GICHelper.h"
56 #include "llvm/Analysis/TargetTransformInfo.h"
57 #include "llvm/Support/Debug.h"
58 #include "isl/aff.h"
59 #include "isl/band.h"
60 #include "isl/constraint.h"
61 #include "isl/map.h"
62 #include "isl/options.h"
63 #include "isl/printer.h"
64 #include "isl/schedule.h"
65 #include "isl/schedule_node.h"
66 #include "isl/space.h"
67 #include "isl/union_map.h"
68 #include "isl/union_set.h"
69 
70 using namespace llvm;
71 using namespace polly;
72 
73 #define DEBUG_TYPE "polly-opt-isl"
74 
75 static cl::opt<std::string>
76     OptimizeDeps("polly-opt-optimize-only",
77                  cl::desc("Only a certain kind of dependences (all/raw)"),
78                  cl::Hidden, cl::init("all"), cl::ZeroOrMore,
79                  cl::cat(PollyCategory));
80 
81 static cl::opt<std::string>
82     SimplifyDeps("polly-opt-simplify-deps",
83                  cl::desc("Dependences should be simplified (yes/no)"),
84                  cl::Hidden, cl::init("yes"), cl::ZeroOrMore,
85                  cl::cat(PollyCategory));
86 
87 static cl::opt<int> MaxConstantTerm(
88     "polly-opt-max-constant-term",
89     cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden,
90     cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));
91 
92 static cl::opt<int> MaxCoefficient(
93     "polly-opt-max-coefficient",
94     cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden,
95     cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));
96 
97 static cl::opt<std::string> FusionStrategy(
98     "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"),
99     cl::Hidden, cl::init("min"), cl::ZeroOrMore, cl::cat(PollyCategory));
100 
101 static cl::opt<std::string>
102     MaximizeBandDepth("polly-opt-maximize-bands",
103                       cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
104                       cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory));
105 
106 static cl::opt<std::string> OuterCoincidence(
107     "polly-opt-outer-coincidence",
108     cl::desc("Try to construct schedules where the outer member of each band "
109              "satisfies the coincidence constraints (yes/no)"),
110     cl::Hidden, cl::init("no"), cl::ZeroOrMore, cl::cat(PollyCategory));
111 
112 static cl::opt<int> PrevectorWidth(
113     "polly-prevect-width",
114     cl::desc(
115         "The number of loop iterations to strip-mine for pre-vectorization"),
116     cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory));
117 
118 static cl::opt<bool> FirstLevelTiling("polly-tiling",
119                                       cl::desc("Enable loop tiling"),
120                                       cl::init(true), cl::ZeroOrMore,
121                                       cl::cat(PollyCategory));
122 
123 static cl::opt<int> LatencyVectorFma(
124     "polly-target-latency-vector-fma",
125     cl::desc("The minimal number of cycles between issuing two "
126              "dependent consecutive vector fused multiply-add "
127              "instructions."),
128     cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
129 
130 static cl::opt<int> ThroughputVectorFma(
131     "polly-target-throughput-vector-fma",
132     cl::desc("A throughput of the processor floating-point arithmetic units "
133              "expressed in the number of vector fused multiply-add "
134              "instructions per clock cycle."),
135     cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory));
136 
137 // This option, along with --polly-target-2nd-cache-level-associativity,
138 // --polly-target-1st-cache-level-size, and --polly-target-2st-cache-level-size
139 // represent the parameters of the target cache, which do not have typical
140 // values that can be used by default. However, to apply the pattern matching
141 // optimizations, we use the values of the parameters of Intel Core i7-3820
142 // SandyBridge in case the parameters are not specified. Such an approach helps
143 // also to attain the high-performance on IBM POWER System S822 and IBM Power
144 // 730 Express server.
145 static cl::opt<int> FirstCacheLevelAssociativity(
146     "polly-target-1st-cache-level-associativity",
147     cl::desc("The associativity of the first cache level."), cl::Hidden,
148     cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
149 
150 static cl::opt<int> SecondCacheLevelAssociativity(
151     "polly-target-2nd-cache-level-associativity",
152     cl::desc("The associativity of the second cache level."), cl::Hidden,
153     cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
154 
155 static cl::opt<int> FirstCacheLevelSize(
156     "polly-target-1st-cache-level-size",
157     cl::desc("The size of the first cache level specified in bytes."),
158     cl::Hidden, cl::init(32768), cl::ZeroOrMore, cl::cat(PollyCategory));
159 
160 static cl::opt<int> SecondCacheLevelSize(
161     "polly-target-2nd-cache-level-size",
162     cl::desc("The size of the second level specified in bytes."), cl::Hidden,
163     cl::init(262144), cl::ZeroOrMore, cl::cat(PollyCategory));
164 
165 static cl::opt<int> VectorRegisterBitwidth(
166     "polly-target-vector-register-bitwidth",
167     cl::desc("The size in bits of a vector register (if not set, this "
168              "information is taken from LLVM's target information."),
169     cl::Hidden, cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
170 
171 static cl::opt<int> FirstLevelDefaultTileSize(
172     "polly-default-tile-size",
173     cl::desc("The default tile size (if not enough were provided by"
174              " --polly-tile-sizes)"),
175     cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory));
176 
177 static cl::list<int>
178     FirstLevelTileSizes("polly-tile-sizes",
179                         cl::desc("A tile size for each loop dimension, filled "
180                                  "with --polly-default-tile-size"),
181                         cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
182                         cl::cat(PollyCategory));
183 
184 static cl::opt<bool>
185     SecondLevelTiling("polly-2nd-level-tiling",
186                       cl::desc("Enable a 2nd level loop of loop tiling"),
187                       cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
188 
189 static cl::opt<int> SecondLevelDefaultTileSize(
190     "polly-2nd-level-default-tile-size",
191     cl::desc("The default 2nd-level tile size (if not enough were provided by"
192              " --polly-2nd-level-tile-sizes)"),
193     cl::Hidden, cl::init(16), cl::ZeroOrMore, cl::cat(PollyCategory));
194 
195 static cl::list<int>
196     SecondLevelTileSizes("polly-2nd-level-tile-sizes",
197                          cl::desc("A tile size for each loop dimension, filled "
198                                   "with --polly-default-tile-size"),
199                          cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
200                          cl::cat(PollyCategory));
201 
202 static cl::opt<bool> RegisterTiling("polly-register-tiling",
203                                     cl::desc("Enable register tiling"),
204                                     cl::init(false), cl::ZeroOrMore,
205                                     cl::cat(PollyCategory));
206 
207 static cl::opt<int> RegisterDefaultTileSize(
208     "polly-register-tiling-default-tile-size",
209     cl::desc("The default register tile size (if not enough were provided by"
210              " --polly-register-tile-sizes)"),
211     cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory));
212 
213 static cl::opt<int> PollyPatternMatchingNcQuotient(
214     "polly-pattern-matching-nc-quotient",
215     cl::desc("Quotient that is obtained by dividing Nc, the parameter of the"
216              "macro-kernel, by Nr, the parameter of the micro-kernel"),
217     cl::Hidden, cl::init(256), cl::ZeroOrMore, cl::cat(PollyCategory));
218 
219 static cl::list<int>
220     RegisterTileSizes("polly-register-tile-sizes",
221                       cl::desc("A tile size for each loop dimension, filled "
222                                "with --polly-register-tile-size"),
223                       cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
224                       cl::cat(PollyCategory));
225 
226 static cl::opt<bool>
227     PMBasedOpts("polly-pattern-matching-based-opts",
228                 cl::desc("Perform optimizations based on pattern matching"),
229                 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
230 
231 static cl::opt<bool> OptimizedScops(
232     "polly-optimized-scops",
233     cl::desc("Polly - Dump polyhedral description of Scops optimized with "
234              "the isl scheduling optimizer and the set of post-scheduling "
235              "transformations is applied on the schedule tree"),
236     cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
237 
238 /// Create an isl_union_set, which describes the isolate option based on
239 /// IsoalteDomain.
240 ///
241 /// @param IsolateDomain An isl_set whose last dimension is the only one that
242 ///                      should belong to the current band node.
243 static __isl_give isl_union_set *
244 getIsolateOptions(__isl_take isl_set *IsolateDomain) {
245   auto Dims = isl_set_dim(IsolateDomain, isl_dim_set);
246   auto *IsolateRelation = isl_map_from_domain(IsolateDomain);
247   IsolateRelation = isl_map_move_dims(IsolateRelation, isl_dim_out, 0,
248                                       isl_dim_in, Dims - 1, 1);
249   auto *IsolateOption = isl_map_wrap(IsolateRelation);
250   auto *Id = isl_id_alloc(isl_set_get_ctx(IsolateOption), "isolate", nullptr);
251   return isl_union_set_from_set(isl_set_set_tuple_id(IsolateOption, Id));
252 }
253 
254 /// Create an isl_union_set, which describes the atomic option for the dimension
255 /// of the current node.
256 ///
257 /// It may help to reduce the size of generated code.
258 ///
259 /// @param Ctx An isl_ctx, which is used to create the isl_union_set.
260 static __isl_give isl_union_set *getAtomicOptions(__isl_take isl_ctx *Ctx) {
261   auto *Space = isl_space_set_alloc(Ctx, 0, 1);
262   auto *AtomicOption = isl_set_universe(Space);
263   auto *Id = isl_id_alloc(Ctx, "atomic", nullptr);
264   return isl_union_set_from_set(isl_set_set_tuple_id(AtomicOption, Id));
265 }
266 
267 /// Make the last dimension of Set to take values from 0 to VectorWidth - 1.
268 ///
269 /// @param Set         A set, which should be modified.
270 /// @param VectorWidth A parameter, which determines the constraint.
271 static __isl_give isl_set *addExtentConstraints(__isl_take isl_set *Set,
272                                                 int VectorWidth) {
273   auto Dims = isl_set_dim(Set, isl_dim_set);
274   auto Space = isl_set_get_space(Set);
275   auto *LocalSpace = isl_local_space_from_space(Space);
276   auto *ExtConstr =
277       isl_constraint_alloc_inequality(isl_local_space_copy(LocalSpace));
278   ExtConstr = isl_constraint_set_constant_si(ExtConstr, 0);
279   ExtConstr =
280       isl_constraint_set_coefficient_si(ExtConstr, isl_dim_set, Dims - 1, 1);
281   Set = isl_set_add_constraint(Set, ExtConstr);
282   ExtConstr = isl_constraint_alloc_inequality(LocalSpace);
283   ExtConstr = isl_constraint_set_constant_si(ExtConstr, VectorWidth - 1);
284   ExtConstr =
285       isl_constraint_set_coefficient_si(ExtConstr, isl_dim_set, Dims - 1, -1);
286   return isl_set_add_constraint(Set, ExtConstr);
287 }
288 
289 /// Build the desired set of partial tile prefixes.
290 ///
291 /// We build a set of partial tile prefixes, which are prefixes of the vector
292 /// loop that have exactly VectorWidth iterations.
293 ///
294 /// 1. Get all prefixes of the vector loop.
295 /// 2. Extend it to a set, which has exactly VectorWidth iterations for
296 ///    any prefix from the set that was built on the previous step.
297 /// 3. Subtract loop domain from it, project out the vector loop dimension and
298 ///    get a set of prefixes, which don't have exactly VectorWidth iterations.
299 /// 4. Subtract it from all prefixes of the vector loop and get the desired
300 ///    set.
301 ///
302 /// @param ScheduleRange A range of a map, which describes a prefix schedule
303 ///                      relation.
304 static __isl_give isl_set *
305 getPartialTilePrefixes(__isl_take isl_set *ScheduleRange, int VectorWidth) {
306   auto Dims = isl_set_dim(ScheduleRange, isl_dim_set);
307   auto *LoopPrefixes = isl_set_project_out(isl_set_copy(ScheduleRange),
308                                            isl_dim_set, Dims - 1, 1);
309   auto *ExtentPrefixes =
310       isl_set_add_dims(isl_set_copy(LoopPrefixes), isl_dim_set, 1);
311   ExtentPrefixes = addExtentConstraints(ExtentPrefixes, VectorWidth);
312   auto *BadPrefixes = isl_set_subtract(ExtentPrefixes, ScheduleRange);
313   BadPrefixes = isl_set_project_out(BadPrefixes, isl_dim_set, Dims - 1, 1);
314   return isl_set_subtract(LoopPrefixes, BadPrefixes);
315 }
316 
317 __isl_give isl_schedule_node *ScheduleTreeOptimizer::isolateFullPartialTiles(
318     __isl_take isl_schedule_node *Node, int VectorWidth) {
319   assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
320   Node = isl_schedule_node_child(Node, 0);
321   Node = isl_schedule_node_child(Node, 0);
322   auto *SchedRelUMap = isl_schedule_node_get_prefix_schedule_relation(Node);
323   auto *ScheduleRelation = isl_map_from_union_map(SchedRelUMap);
324   auto *ScheduleRange = isl_map_range(ScheduleRelation);
325   auto *IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth);
326   auto *AtomicOption = getAtomicOptions(isl_set_get_ctx(IsolateDomain));
327   auto *IsolateOption = getIsolateOptions(IsolateDomain);
328   Node = isl_schedule_node_parent(Node);
329   Node = isl_schedule_node_parent(Node);
330   auto *Options = isl_union_set_union(IsolateOption, AtomicOption);
331   Node = isl_schedule_node_band_set_ast_build_options(Node, Options);
332   return Node;
333 }
334 
335 __isl_give isl_schedule_node *
336 ScheduleTreeOptimizer::prevectSchedBand(__isl_take isl_schedule_node *Node,
337                                         unsigned DimToVectorize,
338                                         int VectorWidth) {
339   assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
340 
341   auto Space = isl_schedule_node_band_get_space(Node);
342   auto ScheduleDimensions = isl_space_dim(Space, isl_dim_set);
343   isl_space_free(Space);
344   assert(DimToVectorize < ScheduleDimensions);
345 
346   if (DimToVectorize > 0) {
347     Node = isl_schedule_node_band_split(Node, DimToVectorize);
348     Node = isl_schedule_node_child(Node, 0);
349   }
350   if (DimToVectorize < ScheduleDimensions - 1)
351     Node = isl_schedule_node_band_split(Node, 1);
352   Space = isl_schedule_node_band_get_space(Node);
353   auto Sizes = isl_multi_val_zero(Space);
354   auto Ctx = isl_schedule_node_get_ctx(Node);
355   Sizes =
356       isl_multi_val_set_val(Sizes, 0, isl_val_int_from_si(Ctx, VectorWidth));
357   Node = isl_schedule_node_band_tile(Node, Sizes);
358   Node = isolateFullPartialTiles(Node, VectorWidth);
359   Node = isl_schedule_node_child(Node, 0);
360   // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise,
361   // we will have troubles to match it in the backend.
362   Node = isl_schedule_node_band_set_ast_build_options(
363       Node, isl_union_set_read_from_str(Ctx, "{ unroll[x]: 1 = 0 }"));
364   Node = isl_schedule_node_band_sink(Node);
365   Node = isl_schedule_node_child(Node, 0);
366   if (isl_schedule_node_get_type(Node) == isl_schedule_node_leaf)
367     Node = isl_schedule_node_parent(Node);
368   isl_id *LoopMarker = isl_id_alloc(Ctx, "SIMD", nullptr);
369   Node = isl_schedule_node_insert_mark(Node, LoopMarker);
370   return Node;
371 }
372 
373 __isl_give isl_schedule_node *
374 ScheduleTreeOptimizer::tileNode(__isl_take isl_schedule_node *Node,
375                                 const char *Identifier, ArrayRef<int> TileSizes,
376                                 int DefaultTileSize) {
377   auto Ctx = isl_schedule_node_get_ctx(Node);
378   auto Space = isl_schedule_node_band_get_space(Node);
379   auto Dims = isl_space_dim(Space, isl_dim_set);
380   auto Sizes = isl_multi_val_zero(Space);
381   std::string IdentifierString(Identifier);
382   for (unsigned i = 0; i < Dims; i++) {
383     auto tileSize = i < TileSizes.size() ? TileSizes[i] : DefaultTileSize;
384     Sizes = isl_multi_val_set_val(Sizes, i, isl_val_int_from_si(Ctx, tileSize));
385   }
386   auto TileLoopMarkerStr = IdentifierString + " - Tiles";
387   isl_id *TileLoopMarker =
388       isl_id_alloc(Ctx, TileLoopMarkerStr.c_str(), nullptr);
389   Node = isl_schedule_node_insert_mark(Node, TileLoopMarker);
390   Node = isl_schedule_node_child(Node, 0);
391   Node = isl_schedule_node_band_tile(Node, Sizes);
392   Node = isl_schedule_node_child(Node, 0);
393   auto PointLoopMarkerStr = IdentifierString + " - Points";
394   isl_id *PointLoopMarker =
395       isl_id_alloc(Ctx, PointLoopMarkerStr.c_str(), nullptr);
396   Node = isl_schedule_node_insert_mark(Node, PointLoopMarker);
397   Node = isl_schedule_node_child(Node, 0);
398   return Node;
399 }
400 
401 __isl_give isl_schedule_node *
402 ScheduleTreeOptimizer::applyRegisterTiling(__isl_take isl_schedule_node *Node,
403                                            llvm::ArrayRef<int> TileSizes,
404                                            int DefaultTileSize) {
405   auto *Ctx = isl_schedule_node_get_ctx(Node);
406   Node = tileNode(Node, "Register tiling", TileSizes, DefaultTileSize);
407   Node = isl_schedule_node_band_set_ast_build_options(
408       Node, isl_union_set_read_from_str(Ctx, "{unroll[x]}"));
409   return Node;
410 }
411 
412 bool ScheduleTreeOptimizer::isTileableBandNode(
413     __isl_keep isl_schedule_node *Node) {
414   if (isl_schedule_node_get_type(Node) != isl_schedule_node_band)
415     return false;
416 
417   if (isl_schedule_node_n_children(Node) != 1)
418     return false;
419 
420   if (!isl_schedule_node_band_get_permutable(Node))
421     return false;
422 
423   auto Space = isl_schedule_node_band_get_space(Node);
424   auto Dims = isl_space_dim(Space, isl_dim_set);
425   isl_space_free(Space);
426 
427   if (Dims <= 1)
428     return false;
429 
430   auto Child = isl_schedule_node_get_child(Node, 0);
431   auto Type = isl_schedule_node_get_type(Child);
432   isl_schedule_node_free(Child);
433 
434   if (Type != isl_schedule_node_leaf)
435     return false;
436 
437   return true;
438 }
439 
440 __isl_give isl_schedule_node *
441 ScheduleTreeOptimizer::standardBandOpts(__isl_take isl_schedule_node *Node,
442                                         void *User) {
443   if (FirstLevelTiling)
444     Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes,
445                     FirstLevelDefaultTileSize);
446 
447   if (SecondLevelTiling)
448     Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes,
449                     SecondLevelDefaultTileSize);
450 
451   if (RegisterTiling)
452     Node =
453         applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize);
454 
455   if (PollyVectorizerChoice == VECTORIZER_NONE)
456     return Node;
457 
458   auto Space = isl_schedule_node_band_get_space(Node);
459   auto Dims = isl_space_dim(Space, isl_dim_set);
460   isl_space_free(Space);
461 
462   for (int i = Dims - 1; i >= 0; i--)
463     if (isl_schedule_node_band_member_get_coincident(Node, i)) {
464       Node = prevectSchedBand(Node, i, PrevectorWidth);
465       break;
466     }
467 
468   return Node;
469 }
470 
471 /// Check whether output dimensions of the map rely on the specified input
472 /// dimension.
473 ///
474 /// @param IslMap The isl map to be considered.
475 /// @param DimNum The number of an input dimension to be checked.
476 static bool isInputDimUsed(__isl_take isl_map *IslMap, unsigned DimNum) {
477   auto *CheckedAccessRelation =
478       isl_map_project_out(isl_map_copy(IslMap), isl_dim_in, DimNum, 1);
479   CheckedAccessRelation =
480       isl_map_insert_dims(CheckedAccessRelation, isl_dim_in, DimNum, 1);
481   auto *InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in);
482   CheckedAccessRelation =
483       isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_in, InputDimsId);
484   InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_out);
485   CheckedAccessRelation =
486       isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_out, InputDimsId);
487   auto res = !isl_map_is_equal(CheckedAccessRelation, IslMap);
488   isl_map_free(CheckedAccessRelation);
489   isl_map_free(IslMap);
490   return res;
491 }
492 
493 /// Check if the SCoP statement could probably be optimized with analytical
494 /// modeling.
495 ///
496 /// containsMatrMult tries to determine whether the following conditions
497 /// are true:
498 /// 1. all memory accesses of the statement will have stride 0 or 1,
499 ///    if we interchange loops (switch the variable used in the inner
500 ///    loop to the outer loop).
501 /// 2. all memory accesses of the statement except from the last one, are
502 ///    read memory access and the last one is write memory access.
503 /// 3. all subscripts of the last memory access of the statement don't contain
504 ///    the variable used in the inner loop.
505 ///
506 /// @param PartialSchedule The PartialSchedule that contains a SCoP statement
507 ///        to check.
508 static bool containsMatrMult(__isl_keep isl_map *PartialSchedule) {
509   auto InputDimsId = isl_map_get_tuple_id(PartialSchedule, isl_dim_in);
510   auto *ScpStmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
511   isl_id_free(InputDimsId);
512   if (ScpStmt->size() <= 1)
513     return false;
514   auto MemA = ScpStmt->begin();
515   for (unsigned i = 0; i < ScpStmt->size() - 2 && MemA != ScpStmt->end();
516        i++, MemA++)
517     if (!(*MemA)->isRead() ||
518         ((*MemA)->isArrayKind() &&
519          !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) ||
520            (*MemA)->isStrideZero(isl_map_copy(PartialSchedule)))))
521       return false;
522   MemA++;
523   if (!(*MemA)->isWrite() || !(*MemA)->isArrayKind() ||
524       !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) ||
525         (*MemA)->isStrideZero(isl_map_copy(PartialSchedule))))
526     return false;
527   auto DimNum = isl_map_dim(PartialSchedule, isl_dim_in);
528   return !isInputDimUsed((*MemA)->getAccessRelation(), DimNum - 1);
529 }
530 
531 /// Circular shift of output dimensions of the integer map.
532 ///
533 /// @param IslMap The isl map to be modified.
534 static __isl_give isl_map *circularShiftOutputDims(__isl_take isl_map *IslMap) {
535   auto DimNum = isl_map_dim(IslMap, isl_dim_out);
536   if (DimNum == 0)
537     return IslMap;
538   auto InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in);
539   IslMap = isl_map_move_dims(IslMap, isl_dim_in, 0, isl_dim_out, DimNum - 1, 1);
540   IslMap = isl_map_move_dims(IslMap, isl_dim_out, 0, isl_dim_in, 0, 1);
541   return isl_map_set_tuple_id(IslMap, isl_dim_in, InputDimsId);
542 }
543 
544 /// Permute two dimensions of the band node.
545 ///
546 /// Permute FirstDim and SecondDim dimensions of the Node.
547 ///
548 /// @param Node The band node to be modified.
549 /// @param FirstDim The first dimension to be permuted.
550 /// @param SecondDim The second dimension to be permuted.
551 static __isl_give isl_schedule_node *
552 permuteBandNodeDimensions(__isl_take isl_schedule_node *Node, unsigned FirstDim,
553                           unsigned SecondDim) {
554   assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band &&
555          isl_schedule_node_band_n_member(Node) > std::max(FirstDim, SecondDim));
556   auto PartialSchedule = isl_schedule_node_band_get_partial_schedule(Node);
557   auto PartialScheduleFirstDim =
558       isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, FirstDim);
559   auto PartialScheduleSecondDim =
560       isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, SecondDim);
561   PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
562       PartialSchedule, SecondDim, PartialScheduleFirstDim);
563   PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
564       PartialSchedule, FirstDim, PartialScheduleSecondDim);
565   Node = isl_schedule_node_delete(Node);
566   Node = isl_schedule_node_insert_partial_schedule(Node, PartialSchedule);
567   return Node;
568 }
569 
570 __isl_give isl_schedule_node *ScheduleTreeOptimizer::createMicroKernel(
571     __isl_take isl_schedule_node *Node, MicroKernelParamsTy MicroKernelParams) {
572   applyRegisterTiling(Node, {MicroKernelParams.Mr, MicroKernelParams.Nr}, 1);
573   Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
574   Node = permuteBandNodeDimensions(Node, 0, 1);
575   return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
576 }
577 
578 __isl_give isl_schedule_node *ScheduleTreeOptimizer::createMacroKernel(
579     __isl_take isl_schedule_node *Node, MacroKernelParamsTy MacroKernelParams) {
580   assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
581   if (MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 1 &&
582       MacroKernelParams.Kc == 1)
583     return Node;
584   Node = tileNode(
585       Node, "1st level tiling",
586       {MacroKernelParams.Mc, MacroKernelParams.Nc, MacroKernelParams.Kc}, 1);
587   Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
588   Node = permuteBandNodeDimensions(Node, 1, 2);
589   Node = permuteBandNodeDimensions(Node, 0, 2);
590   return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
591 }
592 
593 /// Get parameters of the BLIS micro kernel.
594 ///
595 /// We choose the Mr and Nr parameters of the micro kernel to be large enough
596 /// such that no stalls caused by the combination of latencies and dependencies
597 /// are introduced during the updates of the resulting matrix of the matrix
598 /// multiplication. However, they should also be as small as possible to
599 /// release more registers for entries of multiplied matrices.
600 ///
601 /// @param TTI Target Transform Info.
602 /// @return The structure of type MicroKernelParamsTy.
603 /// @see MicroKernelParamsTy
604 static struct MicroKernelParamsTy
605 getMicroKernelParams(const llvm::TargetTransformInfo *TTI) {
606   assert(TTI && "The target transform info should be provided.");
607 
608   // Nvec - Number of double-precision floating-point numbers that can be hold
609   // by a vector register. Use 2 by default.
610   long RegisterBitwidth = VectorRegisterBitwidth;
611 
612   if (RegisterBitwidth == -1)
613     RegisterBitwidth = TTI->getRegisterBitWidth(true);
614   auto Nvec = RegisterBitwidth / 64;
615   if (Nvec == 0)
616     Nvec = 2;
617   int Nr =
618       ceil(sqrt(Nvec * LatencyVectorFma * ThroughputVectorFma) / Nvec) * Nvec;
619   int Mr = ceil(Nvec * LatencyVectorFma * ThroughputVectorFma / Nr);
620   return {Mr, Nr};
621 }
622 
623 /// Get parameters of the BLIS macro kernel.
624 ///
625 /// During the computation of matrix multiplication, blocks of partitioned
626 /// matrices are mapped to different layers of the memory hierarchy.
627 /// To optimize data reuse, blocks should be ideally kept in cache between
628 /// iterations. Since parameters of the macro kernel determine sizes of these
629 /// blocks, there are upper and lower bounds on these parameters.
630 ///
631 /// @param MicroKernelParams Parameters of the micro-kernel
632 ///                          to be taken into account.
633 /// @return The structure of type MacroKernelParamsTy.
634 /// @see MacroKernelParamsTy
635 /// @see MicroKernelParamsTy
636 static struct MacroKernelParamsTy
637 getMacroKernelParams(const MicroKernelParamsTy &MicroKernelParams) {
638   // According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf,
639   // it requires information about the first two levels of a cache to determine
640   // all the parameters of a macro-kernel. It also checks that an associativity
641   // degree of a cache level is greater than two. Otherwise, another algorithm
642   // for determination of the parameters should be used.
643   if (!(MicroKernelParams.Mr > 0 && MicroKernelParams.Nr > 0 &&
644         FirstCacheLevelSize > 0 && SecondCacheLevelSize > 0 &&
645         FirstCacheLevelAssociativity > 2 && SecondCacheLevelAssociativity > 2))
646     return {1, 1, 1};
647   // The quotient should be greater than zero.
648   if (PollyPatternMatchingNcQuotient <= 0)
649     return {1, 1, 1};
650   int Car = floor(
651       (FirstCacheLevelAssociativity - 1) /
652       (1 + static_cast<double>(MicroKernelParams.Nr) / MicroKernelParams.Mr));
653   int Kc = (Car * FirstCacheLevelSize) /
654            (MicroKernelParams.Mr * FirstCacheLevelAssociativity * 8);
655   double Cac = static_cast<double>(Kc * 8 * SecondCacheLevelAssociativity) /
656                SecondCacheLevelSize;
657   int Mc = floor((SecondCacheLevelAssociativity - 2) / Cac);
658   int Nc = PollyPatternMatchingNcQuotient * MicroKernelParams.Nr;
659   return {Mc, Nc, Kc};
660 }
661 
662 /// Identify a memory access through the shape of its memory access relation.
663 ///
664 /// Identify the unique memory access in @p Stmt, that has an access relation
665 /// equal to @p ExpectedAccessRelation.
666 ///
667 /// @param Stmt The SCoP statement that contains the memory accesses under
668 ///             consideration.
669 /// @param ExpectedAccessRelation The access relation that identifies
670 ///                               the memory access.
671 /// @return  The memory access of @p Stmt whose memory access relation is equal
672 ///          to @p ExpectedAccessRelation. nullptr in case there is no or more
673 ///          than one such access.
674 MemoryAccess *
675 identifyAccessByAccessRelation(ScopStmt *Stmt,
676                                __isl_take isl_map *ExpectedAccessRelation) {
677   if (isl_map_has_tuple_id(ExpectedAccessRelation, isl_dim_out))
678     ExpectedAccessRelation =
679         isl_map_reset_tuple_id(ExpectedAccessRelation, isl_dim_out);
680   MemoryAccess *IdentifiedAccess = nullptr;
681   for (auto *Access : *Stmt) {
682     auto *AccessRelation = Access->getAccessRelation();
683     AccessRelation = isl_map_reset_tuple_id(AccessRelation, isl_dim_out);
684     if (isl_map_is_equal(ExpectedAccessRelation, AccessRelation)) {
685       if (IdentifiedAccess) {
686         isl_map_free(AccessRelation);
687         isl_map_free(ExpectedAccessRelation);
688         return nullptr;
689       }
690       IdentifiedAccess = Access;
691     }
692     isl_map_free(AccessRelation);
693   }
694   isl_map_free(ExpectedAccessRelation);
695   return IdentifiedAccess;
696 }
697 
698 /// Add constrains to @Dim dimension of @p ExtMap.
699 ///
700 /// If @ExtMap has the following form [O0, O1, O2]->[I1, I2, I3],
701 /// the following constraint will be added
702 /// Bound * OM <= IM <= Bound * (OM + 1) - 1,
703 /// where M is @p Dim and Bound is @p Bound.
704 ///
705 /// @param ExtMap The isl map to be modified.
706 /// @param Dim The output dimension to be modfied.
707 /// @param Bound The value that is used to specify the constraint.
708 /// @return The modified isl map
709 __isl_give isl_map *
710 addExtensionMapMatMulDimConstraint(__isl_take isl_map *ExtMap, unsigned Dim,
711                                    unsigned Bound) {
712   assert(Bound != 0);
713   auto *ExtMapSpace = isl_map_get_space(ExtMap);
714   auto *ConstrSpace = isl_local_space_from_space(ExtMapSpace);
715   auto *Constr =
716       isl_constraint_alloc_inequality(isl_local_space_copy(ConstrSpace));
717   Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_out, Dim, 1);
718   Constr =
719       isl_constraint_set_coefficient_si(Constr, isl_dim_in, Dim, Bound * (-1));
720   ExtMap = isl_map_add_constraint(ExtMap, Constr);
721   Constr = isl_constraint_alloc_inequality(ConstrSpace);
722   Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_out, Dim, -1);
723   Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_in, Dim, Bound);
724   Constr = isl_constraint_set_constant_si(Constr, Bound - 1);
725   return isl_map_add_constraint(ExtMap, Constr);
726 }
727 
728 /// Create an access relation that is specific for matrix multiplication
729 /// pattern.
730 ///
731 /// Create an access relation of the following form:
732 /// { [O0, O1, O2]->[I1, I2, I3] :
733 ///   FirstOutputDimBound * O0 <= I1 <= FirstOutputDimBound * (O0 + 1) - 1
734 ///   and SecondOutputDimBound * O1 <= I2 <= SecondOutputDimBound * (O1 + 1) - 1
735 ///   and ThirdOutputDimBound * O2 <= I3 <= ThirdOutputDimBound * (O2 + 1) - 1}
736 ///   where FirstOutputDimBound is @p FirstOutputDimBound,
737 ///   SecondOutputDimBound is @p SecondOutputDimBound,
738 ///   ThirdOutputDimBound is @p ThirdOutputDimBound
739 ///
740 /// @param Ctx The isl context.
741 /// @param FirstOutputDimBound,
742 ///        SecondOutputDimBound,
743 ///        ThirdOutputDimBound The parameters of the access relation.
744 /// @return The specified access relation.
745 __isl_give isl_map *getMatMulExt(isl_ctx *Ctx, unsigned FirstOutputDimBound,
746                                  unsigned SecondOutputDimBound,
747                                  unsigned ThirdOutputDimBound) {
748   auto *NewRelSpace = isl_space_alloc(Ctx, 0, 3, 3);
749   auto *extensionMap = isl_map_universe(NewRelSpace);
750   if (!FirstOutputDimBound)
751     extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 0, 0);
752   else
753     extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 0,
754                                                       FirstOutputDimBound);
755   if (!SecondOutputDimBound)
756     extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 1, 0);
757   else
758     extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 1,
759                                                       SecondOutputDimBound);
760   if (!ThirdOutputDimBound)
761     extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 2, 0);
762   else
763     extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 2,
764                                                       ThirdOutputDimBound);
765   return extensionMap;
766 }
767 
768 /// Create an access relation that is specific to the matrix
769 ///        multiplication pattern.
770 ///
771 /// Create an access relation of the following form:
772 /// Stmt[O0, O1, O2]->[OI, OJ],
773 /// where I is @p I, J is @J
774 ///
775 /// @param Stmt The SCoP statement for which to generate the access relation.
776 /// @param I The index of the input dimension that is mapped to the first output
777 ///          dimension.
778 /// @param J The index of the input dimension that is mapped to the second
779 ///          output dimension.
780 /// @return The specified access relation.
781 __isl_give isl_map *
782 getMatMulPatternOriginalAccessRelation(ScopStmt *Stmt, unsigned I, unsigned J) {
783   auto *AccessRelSpace = isl_space_alloc(Stmt->getIslCtx(), 0, 3, 2);
784   auto *AccessRel = isl_map_universe(AccessRelSpace);
785   AccessRel = isl_map_equate(AccessRel, isl_dim_in, I, isl_dim_out, 0);
786   AccessRel = isl_map_equate(AccessRel, isl_dim_in, J, isl_dim_out, 1);
787   AccessRel = isl_map_set_tuple_id(AccessRel, isl_dim_in, Stmt->getDomainId());
788   return AccessRel;
789 }
790 
791 /// Identify the memory access that corresponds to the access to the second
792 /// operand of the matrix multiplication.
793 ///
794 /// Identify the memory access that corresponds to the access
795 /// to the matrix B of the matrix multiplication C = A x B.
796 ///
797 /// @param Stmt The SCoP statement that contains the memory accesses
798 ///             under consideration.
799 /// @return The memory access of @p Stmt that corresponds to the access
800 ///         to the second operand of the matrix multiplication.
801 MemoryAccess *identifyAccessA(ScopStmt *Stmt) {
802   auto *OriginalRel = getMatMulPatternOriginalAccessRelation(Stmt, 0, 2);
803   return identifyAccessByAccessRelation(Stmt, OriginalRel);
804 }
805 
806 /// Identify the memory access that corresponds to the access to the first
807 /// operand of the matrix multiplication.
808 ///
809 /// Identify the memory access that corresponds to the access
810 /// to the matrix A of the matrix multiplication C = A x B.
811 ///
812 /// @param Stmt The SCoP statement that contains the memory accesses
813 ///             under consideration.
814 /// @return The memory access of @p Stmt that corresponds to the access
815 ///         to the first operand of the matrix multiplication.
816 MemoryAccess *identifyAccessB(ScopStmt *Stmt) {
817   auto *OriginalRel = getMatMulPatternOriginalAccessRelation(Stmt, 2, 1);
818   return identifyAccessByAccessRelation(Stmt, OriginalRel);
819 }
820 
821 /// Create an access relation that is specific to
822 ///        the matrix multiplication pattern.
823 ///
824 /// Create an access relation of the following form:
825 /// [O0, O1, O2, O3, O4, O5, O6, O7, O8] -> [OI, O5, OJ]
826 /// where I is @p FirstDim, J is @p SecondDim.
827 ///
828 /// It can be used, for example, to create relations that helps to consequently
829 /// access elements of operands of a matrix multiplication after creation of
830 /// the BLIS micro and macro kernels.
831 ///
832 /// @see ScheduleTreeOptimizer::createMicroKernel
833 /// @see ScheduleTreeOptimizer::createMacroKernel
834 ///
835 /// Subsequently, the described access relation is applied to the range of
836 /// @p MapOldIndVar, that is used to map original induction variables to
837 /// the ones, which are produced by schedule transformations. It helps to
838 /// define relations using a new space and, at the same time, keep them
839 /// in the original one.
840 ///
841 /// @param MapOldIndVar The relation, which maps original induction variables
842 ///                     to the ones, which are produced by schedule
843 ///                     transformations.
844 /// @param FirstDim, SecondDim The input dimensions that are used to define
845 ///        the specified access relation.
846 /// @return The specified access relation.
847 __isl_give isl_map *getMatMulAccRel(__isl_take isl_map *MapOldIndVar,
848                                     unsigned FirstDim, unsigned SecondDim) {
849   auto *Ctx = isl_map_get_ctx(MapOldIndVar);
850   auto *AccessRelSpace = isl_space_alloc(Ctx, 0, 9, 3);
851   auto *AccessRel = isl_map_universe(AccessRelSpace);
852   AccessRel = isl_map_equate(AccessRel, isl_dim_in, FirstDim, isl_dim_out, 0);
853   AccessRel = isl_map_equate(AccessRel, isl_dim_in, 5, isl_dim_out, 1);
854   AccessRel = isl_map_equate(AccessRel, isl_dim_in, SecondDim, isl_dim_out, 2);
855   return isl_map_apply_range(MapOldIndVar, AccessRel);
856 }
857 
858 __isl_give isl_schedule_node *
859 createExtensionNode(__isl_take isl_schedule_node *Node,
860                     __isl_take isl_map *ExtensionMap) {
861   auto *Extension = isl_union_map_from_map(ExtensionMap);
862   auto *NewNode = isl_schedule_node_from_extension(Extension);
863   return isl_schedule_node_graft_before(Node, NewNode);
864 }
865 
866 /// Apply the packing transformation.
867 ///
868 /// The packing transformation can be described as a data-layout
869 /// transformation that requires to introduce a new array, copy data
870 /// to the array, and change memory access locations of the compute kernel
871 /// to reference the array.
872 ///
873 /// @param Node The schedule node to be optimized.
874 /// @param MapOldIndVar The relation, which maps original induction variables
875 ///                     to the ones, which are produced by schedule
876 ///                     transformations.
877 /// @param MicroParams, MacroParams Parameters of the BLIS kernel
878 ///                                 to be taken into account.
879 /// @return The optimized schedule node.
880 static __isl_give isl_schedule_node *optimizeDataLayoutMatrMulPattern(
881     __isl_take isl_schedule_node *Node, __isl_take isl_map *MapOldIndVar,
882     MicroKernelParamsTy MicroParams, MacroKernelParamsTy MacroParams) {
883   // Check whether memory accesses of the SCoP statement correspond to
884   // the matrix multiplication pattern and if this is true, obtain them.
885   auto InputDimsId = isl_map_get_tuple_id(MapOldIndVar, isl_dim_in);
886   auto *Stmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
887   isl_id_free(InputDimsId);
888   MemoryAccess *MemAccessA = identifyAccessA(Stmt);
889   MemoryAccess *MemAccessB = identifyAccessB(Stmt);
890   if (!MemAccessA || !MemAccessB) {
891     isl_map_free(MapOldIndVar);
892     return Node;
893   }
894 
895   // Create a copy statement that corresponds to the memory access to the
896   // matrix B, the second operand of the matrix multiplication.
897   Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
898   Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
899   Node = isl_schedule_node_parent(Node);
900   Node = isl_schedule_node_child(isl_schedule_node_band_split(Node, 2), 0);
901   auto *AccRel = getMatMulAccRel(isl_map_copy(MapOldIndVar), 3, 7);
902   unsigned FirstDimSize = MacroParams.Nc / MicroParams.Nr;
903   unsigned SecondDimSize = MacroParams.Kc;
904   unsigned ThirdDimSize = MicroParams.Nr;
905   auto *SAI = Stmt->getParent()->createScopArrayInfo(
906       MemAccessB->getElementType(), "Packed_B",
907       {FirstDimSize, SecondDimSize, ThirdDimSize});
908   AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
909   auto *OldAcc = MemAccessB->getAccessRelation();
910   MemAccessB->setNewAccessRelation(AccRel);
911   auto *ExtMap =
912       getMatMulExt(Stmt->getIslCtx(), 0, MacroParams.Nc, MacroParams.Kc);
913   isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
914   isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
915   ExtMap = isl_map_project_out(ExtMap, isl_dim_in, 2, 1);
916   auto *Domain = Stmt->getDomain();
917 
918   // Restrict the domains of the copy statements to only execute when also its
919   // originating statement is executed.
920   auto *DomainId = isl_set_get_tuple_id(Domain);
921   auto *NewStmt = Stmt->getParent()->addScopStmt(
922       OldAcc, MemAccessB->getAccessRelation(), isl_set_copy(Domain));
923   ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, isl_id_copy(DomainId));
924   ExtMap = isl_map_intersect_range(ExtMap, isl_set_copy(Domain));
925   ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
926   Node = createExtensionNode(Node, ExtMap);
927 
928   // Create a copy statement that corresponds to the memory access
929   // to the matrix A, the first operand of the matrix multiplication.
930   Node = isl_schedule_node_child(Node, 0);
931   AccRel = getMatMulAccRel(MapOldIndVar, 4, 6);
932   FirstDimSize = MacroParams.Mc / MicroParams.Mr;
933   ThirdDimSize = MicroParams.Mr;
934   SAI = Stmt->getParent()->createScopArrayInfo(
935       MemAccessA->getElementType(), "Packed_A",
936       {FirstDimSize, SecondDimSize, ThirdDimSize});
937   AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
938   OldAcc = MemAccessA->getAccessRelation();
939   MemAccessA->setNewAccessRelation(AccRel);
940   ExtMap = getMatMulExt(Stmt->getIslCtx(), MacroParams.Mc, 0, MacroParams.Kc);
941   isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
942   isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
943   NewStmt = Stmt->getParent()->addScopStmt(
944       OldAcc, MemAccessA->getAccessRelation(), isl_set_copy(Domain));
945 
946   // Restrict the domains of the copy statements to only execute when also its
947   // originating statement is executed.
948   ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, DomainId);
949   ExtMap = isl_map_intersect_range(ExtMap, Domain);
950   ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
951   Node = createExtensionNode(Node, ExtMap);
952   Node = isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
953   return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
954 }
955 
956 /// Get a relation mapping induction variables produced by schedule
957 /// transformations to the original ones.
958 ///
959 /// @param Node The schedule node produced as the result of creation
960 ///        of the BLIS kernels.
961 /// @param MicroKernelParams, MacroKernelParams Parameters of the BLIS kernel
962 ///                                             to be taken into account.
963 /// @return  The relation mapping original induction variables to the ones
964 ///          produced by schedule transformation.
965 /// @see ScheduleTreeOptimizer::createMicroKernel
966 /// @see ScheduleTreeOptimizer::createMacroKernel
967 /// @see getMacroKernelParams
968 __isl_give isl_map *
969 getInductionVariablesSubstitution(__isl_take isl_schedule_node *Node,
970                                   MicroKernelParamsTy MicroKernelParams,
971                                   MacroKernelParamsTy MacroKernelParams) {
972   auto *Child = isl_schedule_node_get_child(Node, 0);
973   auto *UnMapOldIndVar = isl_schedule_node_get_prefix_schedule_union_map(Child);
974   isl_schedule_node_free(Child);
975   auto *MapOldIndVar = isl_map_from_union_map(UnMapOldIndVar);
976   if (isl_map_dim(MapOldIndVar, isl_dim_out) > 9)
977     MapOldIndVar =
978         isl_map_project_out(MapOldIndVar, isl_dim_out, 0,
979                             isl_map_dim(MapOldIndVar, isl_dim_out) - 9);
980   return MapOldIndVar;
981 }
982 
983 __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern(
984     __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
985   assert(TTI && "The target transform info should be provided.");
986   auto MicroKernelParams = getMicroKernelParams(TTI);
987   auto MacroKernelParams = getMacroKernelParams(MicroKernelParams);
988   Node = createMacroKernel(Node, MacroKernelParams);
989   Node = createMicroKernel(Node, MicroKernelParams);
990   if (MacroKernelParams.Mc == 1 || MacroKernelParams.Nc == 1 ||
991       MacroKernelParams.Kc == 1)
992     return Node;
993   auto *MapOldIndVar = getInductionVariablesSubstitution(
994       Node, MicroKernelParams, MacroKernelParams);
995   if (!MapOldIndVar)
996     return Node;
997   return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams,
998                                           MacroKernelParams);
999 }
1000 
1001 bool ScheduleTreeOptimizer::isMatrMultPattern(
1002     __isl_keep isl_schedule_node *Node) {
1003   auto *PartialSchedule =
1004       isl_schedule_node_band_get_partial_schedule_union_map(Node);
1005   if (isl_schedule_node_band_n_member(Node) != 3 ||
1006       isl_union_map_n_map(PartialSchedule) != 1) {
1007     isl_union_map_free(PartialSchedule);
1008     return false;
1009   }
1010   auto *NewPartialSchedule = isl_map_from_union_map(PartialSchedule);
1011   NewPartialSchedule = circularShiftOutputDims(NewPartialSchedule);
1012   if (containsMatrMult(NewPartialSchedule)) {
1013     isl_map_free(NewPartialSchedule);
1014     return true;
1015   }
1016   isl_map_free(NewPartialSchedule);
1017   return false;
1018 }
1019 
1020 __isl_give isl_schedule_node *
1021 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node,
1022                                     void *User) {
1023   if (!isTileableBandNode(Node))
1024     return Node;
1025 
1026   if (PMBasedOpts && User && isMatrMultPattern(Node)) {
1027     DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
1028     const llvm::TargetTransformInfo *TTI;
1029     TTI = static_cast<const llvm::TargetTransformInfo *>(User);
1030     Node = optimizeMatMulPattern(Node, TTI);
1031   }
1032 
1033   return standardBandOpts(Node, User);
1034 }
1035 
1036 __isl_give isl_schedule *
1037 ScheduleTreeOptimizer::optimizeSchedule(__isl_take isl_schedule *Schedule,
1038                                         const llvm::TargetTransformInfo *TTI) {
1039   isl_schedule_node *Root = isl_schedule_get_root(Schedule);
1040   Root = optimizeScheduleNode(Root, TTI);
1041   isl_schedule_free(Schedule);
1042   auto S = isl_schedule_node_get_schedule(Root);
1043   isl_schedule_node_free(Root);
1044   return S;
1045 }
1046 
1047 __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeScheduleNode(
1048     __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
1049   Node = isl_schedule_node_map_descendant_bottom_up(
1050       Node, optimizeBand, const_cast<void *>(static_cast<const void *>(TTI)));
1051   return Node;
1052 }
1053 
1054 bool ScheduleTreeOptimizer::isProfitableSchedule(
1055     Scop &S, __isl_keep isl_schedule *NewSchedule) {
1056   // To understand if the schedule has been optimized we check if the schedule
1057   // has changed at all.
1058   // TODO: We can improve this by tracking if any necessarily beneficial
1059   // transformations have been performed. This can e.g. be tiling, loop
1060   // interchange, or ...) We can track this either at the place where the
1061   // transformation has been performed or, in case of automatic ILP based
1062   // optimizations, by comparing (yet to be defined) performance metrics
1063   // before/after the scheduling optimizer
1064   // (e.g., #stride-one accesses)
1065   if (S.containsExtensionNode(NewSchedule))
1066     return true;
1067   auto *NewScheduleMap = isl_schedule_get_map(NewSchedule);
1068   isl_union_map *OldSchedule = S.getSchedule();
1069   assert(OldSchedule &&
1070          "Only IslScheduleOptimizer can insert extension nodes "
1071          "that make Scop::getSchedule() return nullptr.");
1072   bool changed = !isl_union_map_is_equal(OldSchedule, NewScheduleMap);
1073   isl_union_map_free(OldSchedule);
1074   isl_union_map_free(NewScheduleMap);
1075   return changed;
1076 }
1077 
1078 namespace {
1079 class IslScheduleOptimizer : public ScopPass {
1080 public:
1081   static char ID;
1082   explicit IslScheduleOptimizer() : ScopPass(ID) { LastSchedule = nullptr; }
1083 
1084   ~IslScheduleOptimizer() { isl_schedule_free(LastSchedule); }
1085 
1086   /// Optimize the schedule of the SCoP @p S.
1087   bool runOnScop(Scop &S) override;
1088 
1089   /// Print the new schedule for the SCoP @p S.
1090   void printScop(raw_ostream &OS, Scop &S) const override;
1091 
1092   /// Register all analyses and transformation required.
1093   void getAnalysisUsage(AnalysisUsage &AU) const override;
1094 
1095   /// Release the internal memory.
1096   void releaseMemory() override {
1097     isl_schedule_free(LastSchedule);
1098     LastSchedule = nullptr;
1099   }
1100 
1101 private:
1102   isl_schedule *LastSchedule;
1103 };
1104 } // namespace
1105 
1106 char IslScheduleOptimizer::ID = 0;
1107 
1108 bool IslScheduleOptimizer::runOnScop(Scop &S) {
1109 
1110   // Skip empty SCoPs but still allow code generation as it will delete the
1111   // loops present but not needed.
1112   if (S.getSize() == 0) {
1113     S.markAsOptimized();
1114     return false;
1115   }
1116 
1117   const Dependences &D =
1118       getAnalysis<DependenceInfo>().getDependences(Dependences::AL_Statement);
1119 
1120   if (!D.hasValidDependences())
1121     return false;
1122 
1123   isl_schedule_free(LastSchedule);
1124   LastSchedule = nullptr;
1125 
1126   // Build input data.
1127   int ValidityKinds =
1128       Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
1129   int ProximityKinds;
1130 
1131   if (OptimizeDeps == "all")
1132     ProximityKinds =
1133         Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
1134   else if (OptimizeDeps == "raw")
1135     ProximityKinds = Dependences::TYPE_RAW;
1136   else {
1137     errs() << "Do not know how to optimize for '" << OptimizeDeps << "'"
1138            << " Falling back to optimizing all dependences.\n";
1139     ProximityKinds =
1140         Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
1141   }
1142 
1143   isl_union_set *Domain = S.getDomains();
1144 
1145   if (!Domain)
1146     return false;
1147 
1148   isl_union_map *Validity = D.getDependences(ValidityKinds);
1149   isl_union_map *Proximity = D.getDependences(ProximityKinds);
1150 
1151   // Simplify the dependences by removing the constraints introduced by the
1152   // domains. This can speed up the scheduling time significantly, as large
1153   // constant coefficients will be removed from the dependences. The
1154   // introduction of some additional dependences reduces the possible
1155   // transformations, but in most cases, such transformation do not seem to be
1156   // interesting anyway. In some cases this option may stop the scheduler to
1157   // find any schedule.
1158   if (SimplifyDeps == "yes") {
1159     Validity = isl_union_map_gist_domain(Validity, isl_union_set_copy(Domain));
1160     Validity = isl_union_map_gist_range(Validity, isl_union_set_copy(Domain));
1161     Proximity =
1162         isl_union_map_gist_domain(Proximity, isl_union_set_copy(Domain));
1163     Proximity = isl_union_map_gist_range(Proximity, isl_union_set_copy(Domain));
1164   } else if (SimplifyDeps != "no") {
1165     errs() << "warning: Option -polly-opt-simplify-deps should either be 'yes' "
1166               "or 'no'. Falling back to default: 'yes'\n";
1167   }
1168 
1169   DEBUG(dbgs() << "\n\nCompute schedule from: ");
1170   DEBUG(dbgs() << "Domain := " << stringFromIslObj(Domain) << ";\n");
1171   DEBUG(dbgs() << "Proximity := " << stringFromIslObj(Proximity) << ";\n");
1172   DEBUG(dbgs() << "Validity := " << stringFromIslObj(Validity) << ";\n");
1173 
1174   unsigned IslSerializeSCCs;
1175 
1176   if (FusionStrategy == "max") {
1177     IslSerializeSCCs = 0;
1178   } else if (FusionStrategy == "min") {
1179     IslSerializeSCCs = 1;
1180   } else {
1181     errs() << "warning: Unknown fusion strategy. Falling back to maximal "
1182               "fusion.\n";
1183     IslSerializeSCCs = 0;
1184   }
1185 
1186   int IslMaximizeBands;
1187 
1188   if (MaximizeBandDepth == "yes") {
1189     IslMaximizeBands = 1;
1190   } else if (MaximizeBandDepth == "no") {
1191     IslMaximizeBands = 0;
1192   } else {
1193     errs() << "warning: Option -polly-opt-maximize-bands should either be 'yes'"
1194               " or 'no'. Falling back to default: 'yes'\n";
1195     IslMaximizeBands = 1;
1196   }
1197 
1198   int IslOuterCoincidence;
1199 
1200   if (OuterCoincidence == "yes") {
1201     IslOuterCoincidence = 1;
1202   } else if (OuterCoincidence == "no") {
1203     IslOuterCoincidence = 0;
1204   } else {
1205     errs() << "warning: Option -polly-opt-outer-coincidence should either be "
1206               "'yes' or 'no'. Falling back to default: 'no'\n";
1207     IslOuterCoincidence = 0;
1208   }
1209 
1210   isl_ctx *Ctx = S.getIslCtx();
1211 
1212   isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence);
1213   isl_options_set_schedule_serialize_sccs(Ctx, IslSerializeSCCs);
1214   isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands);
1215   isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm);
1216   isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient);
1217   isl_options_set_tile_scale_tile_loops(Ctx, 0);
1218 
1219   auto OnErrorStatus = isl_options_get_on_error(Ctx);
1220   isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE);
1221 
1222   isl_schedule_constraints *ScheduleConstraints;
1223   ScheduleConstraints = isl_schedule_constraints_on_domain(Domain);
1224   ScheduleConstraints =
1225       isl_schedule_constraints_set_proximity(ScheduleConstraints, Proximity);
1226   ScheduleConstraints = isl_schedule_constraints_set_validity(
1227       ScheduleConstraints, isl_union_map_copy(Validity));
1228   ScheduleConstraints =
1229       isl_schedule_constraints_set_coincidence(ScheduleConstraints, Validity);
1230   isl_schedule *Schedule;
1231   Schedule = isl_schedule_constraints_compute_schedule(ScheduleConstraints);
1232   isl_options_set_on_error(Ctx, OnErrorStatus);
1233 
1234   // In cases the scheduler is not able to optimize the code, we just do not
1235   // touch the schedule.
1236   if (!Schedule)
1237     return false;
1238 
1239   DEBUG({
1240     auto *P = isl_printer_to_str(Ctx);
1241     P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
1242     P = isl_printer_print_schedule(P, Schedule);
1243     auto *str = isl_printer_get_str(P);
1244     dbgs() << "NewScheduleTree: \n" << str << "\n";
1245     free(str);
1246     isl_printer_free(P);
1247   });
1248 
1249   Function &F = S.getFunction();
1250   auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1251   isl_schedule *NewSchedule =
1252       ScheduleTreeOptimizer::optimizeSchedule(Schedule, TTI);
1253 
1254   if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewSchedule)) {
1255     isl_schedule_free(NewSchedule);
1256     return false;
1257   }
1258 
1259   S.setScheduleTree(NewSchedule);
1260   S.markAsOptimized();
1261 
1262   if (OptimizedScops)
1263     S.dump();
1264 
1265   return false;
1266 }
1267 
1268 void IslScheduleOptimizer::printScop(raw_ostream &OS, Scop &) const {
1269   isl_printer *p;
1270   char *ScheduleStr;
1271 
1272   OS << "Calculated schedule:\n";
1273 
1274   if (!LastSchedule) {
1275     OS << "n/a\n";
1276     return;
1277   }
1278 
1279   p = isl_printer_to_str(isl_schedule_get_ctx(LastSchedule));
1280   p = isl_printer_print_schedule(p, LastSchedule);
1281   ScheduleStr = isl_printer_get_str(p);
1282   isl_printer_free(p);
1283 
1284   OS << ScheduleStr << "\n";
1285 }
1286 
1287 void IslScheduleOptimizer::getAnalysisUsage(AnalysisUsage &AU) const {
1288   ScopPass::getAnalysisUsage(AU);
1289   AU.addRequired<DependenceInfo>();
1290   AU.addRequired<TargetTransformInfoWrapperPass>();
1291 }
1292 
1293 Pass *polly::createIslScheduleOptimizerPass() {
1294   return new IslScheduleOptimizer();
1295 }
1296 
1297 INITIALIZE_PASS_BEGIN(IslScheduleOptimizer, "polly-opt-isl",
1298                       "Polly - Optimize schedule of SCoP", false, false);
1299 INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
1300 INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass);
1301 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass);
1302 INITIALIZE_PASS_END(IslScheduleOptimizer, "polly-opt-isl",
1303                     "Polly - Optimize schedule of SCoP", false, false)
1304