1 //===- Schedule.cpp - Calculate an optimized schedule ---------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass generates an entirely new schedule tree from the data dependences
11 // and iteration domains. The new schedule tree is computed in two steps:
12 //
13 // 1) The isl scheduling optimizer is run
14 //
15 // The isl scheduling optimizer creates a new schedule tree that maximizes
16 // parallelism and tileability and minimizes data-dependence distances. The
17 // algorithm used is a modified version of the ``Pluto'' algorithm:
18 //
19 //   U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
20 //   A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
21 //   In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
22 //   Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008.
23 //
24 // 2) A set of post-scheduling transformations is applied on the schedule tree.
25 //
26 // These optimizations include:
27 //
28 //  - Tiling of the innermost tilable bands
29 //  - Prevectorization - The coice of a possible outer loop that is strip-mined
30 //                       to the innermost level to enable inner-loop
31 //                       vectorization.
32 //  - Some optimizations for spatial locality are also planned.
33 //
34 // For a detailed description of the schedule tree itself please see section 6
35 // of:
36 //
37 // Polyhedral AST generation is more than scanning polyhedra
38 // Tobias Grosser, Sven Verdoolaege, Albert Cohen
39 // ACM Transations on Programming Languages and Systems (TOPLAS),
40 // 37(4), July 2015
41 // http://www.grosser.es/#pub-polyhedral-AST-generation
42 //
43 // This publication also contains a detailed discussion of the different options
44 // for polyhedral loop unrolling, full/partial tile separation and other uses
45 // of the schedule tree.
46 //
47 //===----------------------------------------------------------------------===//
48 
49 #include "polly/ScheduleOptimizer.h"
50 #include "polly/CodeGen/CodeGeneration.h"
51 #include "polly/DependenceInfo.h"
52 #include "polly/LinkAllPasses.h"
53 #include "polly/Options.h"
54 #include "polly/ScopInfo.h"
55 #include "polly/Support/GICHelper.h"
56 #include "llvm/Analysis/TargetTransformInfo.h"
57 #include "llvm/Support/Debug.h"
58 #include "isl/aff.h"
59 #include "isl/band.h"
60 #include "isl/constraint.h"
61 #include "isl/map.h"
62 #include "isl/options.h"
63 #include "isl/printer.h"
64 #include "isl/schedule.h"
65 #include "isl/schedule_node.h"
66 #include "isl/space.h"
67 #include "isl/union_map.h"
68 #include "isl/union_set.h"
69 
70 using namespace llvm;
71 using namespace polly;
72 
73 #define DEBUG_TYPE "polly-opt-isl"
74 
75 static cl::opt<std::string>
76     OptimizeDeps("polly-opt-optimize-only",
77                  cl::desc("Only a certain kind of dependences (all/raw)"),
78                  cl::Hidden, cl::init("all"), cl::ZeroOrMore,
79                  cl::cat(PollyCategory));
80 
81 static cl::opt<std::string>
82     SimplifyDeps("polly-opt-simplify-deps",
83                  cl::desc("Dependences should be simplified (yes/no)"),
84                  cl::Hidden, cl::init("yes"), cl::ZeroOrMore,
85                  cl::cat(PollyCategory));
86 
87 static cl::opt<int> MaxConstantTerm(
88     "polly-opt-max-constant-term",
89     cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden,
90     cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));
91 
92 static cl::opt<int> MaxCoefficient(
93     "polly-opt-max-coefficient",
94     cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden,
95     cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));
96 
97 static cl::opt<std::string> FusionStrategy(
98     "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"),
99     cl::Hidden, cl::init("min"), cl::ZeroOrMore, cl::cat(PollyCategory));
100 
101 static cl::opt<std::string>
102     MaximizeBandDepth("polly-opt-maximize-bands",
103                       cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
104                       cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory));
105 
106 static cl::opt<std::string> OuterCoincidence(
107     "polly-opt-outer-coincidence",
108     cl::desc("Try to construct schedules where the outer member of each band "
109              "satisfies the coincidence constraints (yes/no)"),
110     cl::Hidden, cl::init("no"), cl::ZeroOrMore, cl::cat(PollyCategory));
111 
112 static cl::opt<int> PrevectorWidth(
113     "polly-prevect-width",
114     cl::desc(
115         "The number of loop iterations to strip-mine for pre-vectorization"),
116     cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory));
117 
118 static cl::opt<bool> FirstLevelTiling("polly-tiling",
119                                       cl::desc("Enable loop tiling"),
120                                       cl::init(true), cl::ZeroOrMore,
121                                       cl::cat(PollyCategory));
122 
123 static cl::opt<int> LatencyVectorFma(
124     "polly-target-latency-vector-fma",
125     cl::desc("The minimal number of cycles between issuing two "
126              "dependent consecutive vector fused multiply-add "
127              "instructions."),
128     cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
129 
130 static cl::opt<int> ThrougputVectorFma(
131     "polly-target-througput-vector-fma",
132     cl::desc("A throughput of the processor floating-point arithmetic units "
133              "expressed in the number of vector fused multiply-add "
134              "instructions per clock cycle."),
135     cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory));
136 
137 static cl::list<int>
138     CacheLevelAssociativity("polly-target-cache-level-associativity",
139                             cl::desc("The associativity of each cache level."),
140                             cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
141                             cl::cat(PollyCategory));
142 
143 static cl::list<int> CacheLevelSizes(
144     "polly-target-cache-level-sizes",
145     cl::desc("The size of each cache level specified in bytes."), cl::Hidden,
146     cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory));
147 
148 static cl::opt<int> FirstLevelDefaultTileSize(
149     "polly-default-tile-size",
150     cl::desc("The default tile size (if not enough were provided by"
151              " --polly-tile-sizes)"),
152     cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory));
153 
154 static cl::list<int> FirstLevelTileSizes(
155     "polly-tile-sizes", cl::desc("A tile size for each loop dimension, filled "
156                                  "with --polly-default-tile-size"),
157     cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory));
158 
159 static cl::opt<bool>
160     SecondLevelTiling("polly-2nd-level-tiling",
161                       cl::desc("Enable a 2nd level loop of loop tiling"),
162                       cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
163 
164 static cl::opt<int> SecondLevelDefaultTileSize(
165     "polly-2nd-level-default-tile-size",
166     cl::desc("The default 2nd-level tile size (if not enough were provided by"
167              " --polly-2nd-level-tile-sizes)"),
168     cl::Hidden, cl::init(16), cl::ZeroOrMore, cl::cat(PollyCategory));
169 
170 static cl::list<int>
171     SecondLevelTileSizes("polly-2nd-level-tile-sizes",
172                          cl::desc("A tile size for each loop dimension, filled "
173                                   "with --polly-default-tile-size"),
174                          cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
175                          cl::cat(PollyCategory));
176 
177 static cl::opt<bool> RegisterTiling("polly-register-tiling",
178                                     cl::desc("Enable register tiling"),
179                                     cl::init(false), cl::ZeroOrMore,
180                                     cl::cat(PollyCategory));
181 
182 static cl::opt<int> RegisterDefaultTileSize(
183     "polly-register-tiling-default-tile-size",
184     cl::desc("The default register tile size (if not enough were provided by"
185              " --polly-register-tile-sizes)"),
186     cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory));
187 
188 static cl::opt<int> PollyPatternMatchingNcQuotient(
189     "polly-pattern-matching-nc-quotient",
190     cl::desc("Quotient that is obtained by dividing Nc, the parameter of the"
191              "macro-kernel, by Nr, the parameter of the micro-kernel"),
192     cl::Hidden, cl::init(256), cl::ZeroOrMore, cl::cat(PollyCategory));
193 
194 static cl::list<int>
195     RegisterTileSizes("polly-register-tile-sizes",
196                       cl::desc("A tile size for each loop dimension, filled "
197                                "with --polly-register-tile-size"),
198                       cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
199                       cl::cat(PollyCategory));
200 
201 static cl::opt<bool>
202     PMBasedOpts("polly-pattern-matching-based-opts",
203                 cl::desc("Perform optimizations based on pattern matching"),
204                 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
205 
206 static cl::opt<bool> OptimizedScops(
207     "polly-optimized-scops",
208     cl::desc("Polly - Dump polyhedral description of Scops optimized with "
209              "the isl scheduling optimizer and the set of post-scheduling "
210              "transformations is applied on the schedule tree"),
211     cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
212 
213 /// Create an isl_union_set, which describes the isolate option based on
214 /// IsoalteDomain.
215 ///
216 /// @param IsolateDomain An isl_set whose last dimension is the only one that
217 ///                      should belong to the current band node.
218 static __isl_give isl_union_set *
219 getIsolateOptions(__isl_take isl_set *IsolateDomain) {
220   auto Dims = isl_set_dim(IsolateDomain, isl_dim_set);
221   auto *IsolateRelation = isl_map_from_domain(IsolateDomain);
222   IsolateRelation = isl_map_move_dims(IsolateRelation, isl_dim_out, 0,
223                                       isl_dim_in, Dims - 1, 1);
224   auto *IsolateOption = isl_map_wrap(IsolateRelation);
225   auto *Id = isl_id_alloc(isl_set_get_ctx(IsolateOption), "isolate", nullptr);
226   return isl_union_set_from_set(isl_set_set_tuple_id(IsolateOption, Id));
227 }
228 
229 /// Create an isl_union_set, which describes the atomic option for the dimension
230 /// of the current node.
231 ///
232 /// It may help to reduce the size of generated code.
233 ///
234 /// @param Ctx An isl_ctx, which is used to create the isl_union_set.
235 static __isl_give isl_union_set *getAtomicOptions(__isl_take isl_ctx *Ctx) {
236   auto *Space = isl_space_set_alloc(Ctx, 0, 1);
237   auto *AtomicOption = isl_set_universe(Space);
238   auto *Id = isl_id_alloc(Ctx, "atomic", nullptr);
239   return isl_union_set_from_set(isl_set_set_tuple_id(AtomicOption, Id));
240 }
241 
242 /// Make the last dimension of Set to take values from 0 to VectorWidth - 1.
243 ///
244 /// @param Set         A set, which should be modified.
245 /// @param VectorWidth A parameter, which determines the constraint.
246 static __isl_give isl_set *addExtentConstraints(__isl_take isl_set *Set,
247                                                 int VectorWidth) {
248   auto Dims = isl_set_dim(Set, isl_dim_set);
249   auto Space = isl_set_get_space(Set);
250   auto *LocalSpace = isl_local_space_from_space(Space);
251   auto *ExtConstr =
252       isl_constraint_alloc_inequality(isl_local_space_copy(LocalSpace));
253   ExtConstr = isl_constraint_set_constant_si(ExtConstr, 0);
254   ExtConstr =
255       isl_constraint_set_coefficient_si(ExtConstr, isl_dim_set, Dims - 1, 1);
256   Set = isl_set_add_constraint(Set, ExtConstr);
257   ExtConstr = isl_constraint_alloc_inequality(LocalSpace);
258   ExtConstr = isl_constraint_set_constant_si(ExtConstr, VectorWidth - 1);
259   ExtConstr =
260       isl_constraint_set_coefficient_si(ExtConstr, isl_dim_set, Dims - 1, -1);
261   return isl_set_add_constraint(Set, ExtConstr);
262 }
263 
264 /// Build the desired set of partial tile prefixes.
265 ///
266 /// We build a set of partial tile prefixes, which are prefixes of the vector
267 /// loop that have exactly VectorWidth iterations.
268 ///
269 /// 1. Get all prefixes of the vector loop.
270 /// 2. Extend it to a set, which has exactly VectorWidth iterations for
271 ///    any prefix from the set that was built on the previous step.
272 /// 3. Subtract loop domain from it, project out the vector loop dimension and
273 ///    get a set of prefixes, which don't have exactly VectorWidth iterations.
274 /// 4. Subtract it from all prefixes of the vector loop and get the desired
275 ///    set.
276 ///
277 /// @param ScheduleRange A range of a map, which describes a prefix schedule
278 ///                      relation.
279 static __isl_give isl_set *
280 getPartialTilePrefixes(__isl_take isl_set *ScheduleRange, int VectorWidth) {
281   auto Dims = isl_set_dim(ScheduleRange, isl_dim_set);
282   auto *LoopPrefixes = isl_set_project_out(isl_set_copy(ScheduleRange),
283                                            isl_dim_set, Dims - 1, 1);
284   auto *ExtentPrefixes =
285       isl_set_add_dims(isl_set_copy(LoopPrefixes), isl_dim_set, 1);
286   ExtentPrefixes = addExtentConstraints(ExtentPrefixes, VectorWidth);
287   auto *BadPrefixes = isl_set_subtract(ExtentPrefixes, ScheduleRange);
288   BadPrefixes = isl_set_project_out(BadPrefixes, isl_dim_set, Dims - 1, 1);
289   return isl_set_subtract(LoopPrefixes, BadPrefixes);
290 }
291 
292 __isl_give isl_schedule_node *ScheduleTreeOptimizer::isolateFullPartialTiles(
293     __isl_take isl_schedule_node *Node, int VectorWidth) {
294   assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
295   Node = isl_schedule_node_child(Node, 0);
296   Node = isl_schedule_node_child(Node, 0);
297   auto *SchedRelUMap = isl_schedule_node_get_prefix_schedule_relation(Node);
298   auto *ScheduleRelation = isl_map_from_union_map(SchedRelUMap);
299   auto *ScheduleRange = isl_map_range(ScheduleRelation);
300   auto *IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth);
301   auto *AtomicOption = getAtomicOptions(isl_set_get_ctx(IsolateDomain));
302   auto *IsolateOption = getIsolateOptions(IsolateDomain);
303   Node = isl_schedule_node_parent(Node);
304   Node = isl_schedule_node_parent(Node);
305   auto *Options = isl_union_set_union(IsolateOption, AtomicOption);
306   Node = isl_schedule_node_band_set_ast_build_options(Node, Options);
307   return Node;
308 }
309 
310 __isl_give isl_schedule_node *
311 ScheduleTreeOptimizer::prevectSchedBand(__isl_take isl_schedule_node *Node,
312                                         unsigned DimToVectorize,
313                                         int VectorWidth) {
314   assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
315 
316   auto Space = isl_schedule_node_band_get_space(Node);
317   auto ScheduleDimensions = isl_space_dim(Space, isl_dim_set);
318   isl_space_free(Space);
319   assert(DimToVectorize < ScheduleDimensions);
320 
321   if (DimToVectorize > 0) {
322     Node = isl_schedule_node_band_split(Node, DimToVectorize);
323     Node = isl_schedule_node_child(Node, 0);
324   }
325   if (DimToVectorize < ScheduleDimensions - 1)
326     Node = isl_schedule_node_band_split(Node, 1);
327   Space = isl_schedule_node_band_get_space(Node);
328   auto Sizes = isl_multi_val_zero(Space);
329   auto Ctx = isl_schedule_node_get_ctx(Node);
330   Sizes =
331       isl_multi_val_set_val(Sizes, 0, isl_val_int_from_si(Ctx, VectorWidth));
332   Node = isl_schedule_node_band_tile(Node, Sizes);
333   Node = isolateFullPartialTiles(Node, VectorWidth);
334   Node = isl_schedule_node_child(Node, 0);
335   // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise,
336   // we will have troubles to match it in the backend.
337   Node = isl_schedule_node_band_set_ast_build_options(
338       Node, isl_union_set_read_from_str(Ctx, "{ unroll[x]: 1 = 0 }"));
339   Node = isl_schedule_node_band_sink(Node);
340   Node = isl_schedule_node_child(Node, 0);
341   if (isl_schedule_node_get_type(Node) == isl_schedule_node_leaf)
342     Node = isl_schedule_node_parent(Node);
343   isl_id *LoopMarker = isl_id_alloc(Ctx, "SIMD", nullptr);
344   Node = isl_schedule_node_insert_mark(Node, LoopMarker);
345   return Node;
346 }
347 
348 __isl_give isl_schedule_node *
349 ScheduleTreeOptimizer::tileNode(__isl_take isl_schedule_node *Node,
350                                 const char *Identifier, ArrayRef<int> TileSizes,
351                                 int DefaultTileSize) {
352   auto Ctx = isl_schedule_node_get_ctx(Node);
353   auto Space = isl_schedule_node_band_get_space(Node);
354   auto Dims = isl_space_dim(Space, isl_dim_set);
355   auto Sizes = isl_multi_val_zero(Space);
356   std::string IdentifierString(Identifier);
357   for (unsigned i = 0; i < Dims; i++) {
358     auto tileSize = i < TileSizes.size() ? TileSizes[i] : DefaultTileSize;
359     Sizes = isl_multi_val_set_val(Sizes, i, isl_val_int_from_si(Ctx, tileSize));
360   }
361   auto TileLoopMarkerStr = IdentifierString + " - Tiles";
362   isl_id *TileLoopMarker =
363       isl_id_alloc(Ctx, TileLoopMarkerStr.c_str(), nullptr);
364   Node = isl_schedule_node_insert_mark(Node, TileLoopMarker);
365   Node = isl_schedule_node_child(Node, 0);
366   Node = isl_schedule_node_band_tile(Node, Sizes);
367   Node = isl_schedule_node_child(Node, 0);
368   auto PointLoopMarkerStr = IdentifierString + " - Points";
369   isl_id *PointLoopMarker =
370       isl_id_alloc(Ctx, PointLoopMarkerStr.c_str(), nullptr);
371   Node = isl_schedule_node_insert_mark(Node, PointLoopMarker);
372   Node = isl_schedule_node_child(Node, 0);
373   return Node;
374 }
375 
376 __isl_give isl_schedule_node *
377 ScheduleTreeOptimizer::applyRegisterTiling(__isl_take isl_schedule_node *Node,
378                                            llvm::ArrayRef<int> TileSizes,
379                                            int DefaultTileSize) {
380   auto *Ctx = isl_schedule_node_get_ctx(Node);
381   Node = tileNode(Node, "Register tiling", TileSizes, DefaultTileSize);
382   Node = isl_schedule_node_band_set_ast_build_options(
383       Node, isl_union_set_read_from_str(Ctx, "{unroll[x]}"));
384   return Node;
385 }
386 
387 bool ScheduleTreeOptimizer::isTileableBandNode(
388     __isl_keep isl_schedule_node *Node) {
389   if (isl_schedule_node_get_type(Node) != isl_schedule_node_band)
390     return false;
391 
392   if (isl_schedule_node_n_children(Node) != 1)
393     return false;
394 
395   if (!isl_schedule_node_band_get_permutable(Node))
396     return false;
397 
398   auto Space = isl_schedule_node_band_get_space(Node);
399   auto Dims = isl_space_dim(Space, isl_dim_set);
400   isl_space_free(Space);
401 
402   if (Dims <= 1)
403     return false;
404 
405   auto Child = isl_schedule_node_get_child(Node, 0);
406   auto Type = isl_schedule_node_get_type(Child);
407   isl_schedule_node_free(Child);
408 
409   if (Type != isl_schedule_node_leaf)
410     return false;
411 
412   return true;
413 }
414 
415 __isl_give isl_schedule_node *
416 ScheduleTreeOptimizer::standardBandOpts(__isl_take isl_schedule_node *Node,
417                                         void *User) {
418   if (FirstLevelTiling)
419     Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes,
420                     FirstLevelDefaultTileSize);
421 
422   if (SecondLevelTiling)
423     Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes,
424                     SecondLevelDefaultTileSize);
425 
426   if (RegisterTiling)
427     Node =
428         applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize);
429 
430   if (PollyVectorizerChoice == VECTORIZER_NONE)
431     return Node;
432 
433   auto Space = isl_schedule_node_band_get_space(Node);
434   auto Dims = isl_space_dim(Space, isl_dim_set);
435   isl_space_free(Space);
436 
437   for (int i = Dims - 1; i >= 0; i--)
438     if (isl_schedule_node_band_member_get_coincident(Node, i)) {
439       Node = prevectSchedBand(Node, i, PrevectorWidth);
440       break;
441     }
442 
443   return Node;
444 }
445 
446 /// Check whether output dimensions of the map rely on the specified input
447 /// dimension.
448 ///
449 /// @param IslMap The isl map to be considered.
450 /// @param DimNum The number of an input dimension to be checked.
451 static bool isInputDimUsed(__isl_take isl_map *IslMap, unsigned DimNum) {
452   auto *CheckedAccessRelation =
453       isl_map_project_out(isl_map_copy(IslMap), isl_dim_in, DimNum, 1);
454   CheckedAccessRelation =
455       isl_map_insert_dims(CheckedAccessRelation, isl_dim_in, DimNum, 1);
456   auto *InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in);
457   CheckedAccessRelation =
458       isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_in, InputDimsId);
459   InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_out);
460   CheckedAccessRelation =
461       isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_out, InputDimsId);
462   auto res = !isl_map_is_equal(CheckedAccessRelation, IslMap);
463   isl_map_free(CheckedAccessRelation);
464   isl_map_free(IslMap);
465   return res;
466 }
467 
468 /// Check if the SCoP statement could probably be optimized with analytical
469 /// modeling.
470 ///
471 /// containsMatrMult tries to determine whether the following conditions
472 /// are true:
473 /// 1. all memory accesses of the statement will have stride 0 or 1,
474 ///    if we interchange loops (switch the variable used in the inner
475 ///    loop to the outer loop).
476 /// 2. all memory accesses of the statement except from the last one, are
477 ///    read memory access and the last one is write memory access.
478 /// 3. all subscripts of the last memory access of the statement don't contain
479 ///    the variable used in the inner loop.
480 ///
481 /// @param PartialSchedule The PartialSchedule that contains a SCoP statement
482 ///        to check.
483 static bool containsMatrMult(__isl_keep isl_map *PartialSchedule) {
484   auto InputDimsId = isl_map_get_tuple_id(PartialSchedule, isl_dim_in);
485   auto *ScpStmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
486   isl_id_free(InputDimsId);
487   if (ScpStmt->size() <= 1)
488     return false;
489   auto MemA = ScpStmt->begin();
490   for (unsigned i = 0; i < ScpStmt->size() - 2 && MemA != ScpStmt->end();
491        i++, MemA++)
492     if (!(*MemA)->isRead() ||
493         ((*MemA)->isArrayKind() &&
494          !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) ||
495            (*MemA)->isStrideZero(isl_map_copy(PartialSchedule)))))
496       return false;
497   MemA++;
498   if (!(*MemA)->isWrite() || !(*MemA)->isArrayKind() ||
499       !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) ||
500         (*MemA)->isStrideZero(isl_map_copy(PartialSchedule))))
501     return false;
502   auto DimNum = isl_map_dim(PartialSchedule, isl_dim_in);
503   return !isInputDimUsed((*MemA)->getAccessRelation(), DimNum - 1);
504 }
505 
506 /// Circular shift of output dimensions of the integer map.
507 ///
508 /// @param IslMap The isl map to be modified.
509 static __isl_give isl_map *circularShiftOutputDims(__isl_take isl_map *IslMap) {
510   auto DimNum = isl_map_dim(IslMap, isl_dim_out);
511   if (DimNum == 0)
512     return IslMap;
513   auto InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in);
514   IslMap = isl_map_move_dims(IslMap, isl_dim_in, 0, isl_dim_out, DimNum - 1, 1);
515   IslMap = isl_map_move_dims(IslMap, isl_dim_out, 0, isl_dim_in, 0, 1);
516   return isl_map_set_tuple_id(IslMap, isl_dim_in, InputDimsId);
517 }
518 
519 /// Permute two dimensions of the band node.
520 ///
521 /// Permute FirstDim and SecondDim dimensions of the Node.
522 ///
523 /// @param Node The band node to be modified.
524 /// @param FirstDim The first dimension to be permuted.
525 /// @param SecondDim The second dimension to be permuted.
526 static __isl_give isl_schedule_node *
527 permuteBandNodeDimensions(__isl_take isl_schedule_node *Node, unsigned FirstDim,
528                           unsigned SecondDim) {
529   assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band &&
530          isl_schedule_node_band_n_member(Node) > std::max(FirstDim, SecondDim));
531   auto PartialSchedule = isl_schedule_node_band_get_partial_schedule(Node);
532   auto PartialScheduleFirstDim =
533       isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, FirstDim);
534   auto PartialScheduleSecondDim =
535       isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, SecondDim);
536   PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
537       PartialSchedule, SecondDim, PartialScheduleFirstDim);
538   PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
539       PartialSchedule, FirstDim, PartialScheduleSecondDim);
540   Node = isl_schedule_node_delete(Node);
541   Node = isl_schedule_node_insert_partial_schedule(Node, PartialSchedule);
542   return Node;
543 }
544 
545 __isl_give isl_schedule_node *ScheduleTreeOptimizer::createMicroKernel(
546     __isl_take isl_schedule_node *Node, MicroKernelParamsTy MicroKernelParams) {
547   applyRegisterTiling(Node, {MicroKernelParams.Mr, MicroKernelParams.Nr}, 1);
548   Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
549   Node = permuteBandNodeDimensions(Node, 0, 1);
550   return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
551 }
552 
553 __isl_give isl_schedule_node *ScheduleTreeOptimizer::createMacroKernel(
554     __isl_take isl_schedule_node *Node, MacroKernelParamsTy MacroKernelParams) {
555   assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
556   if (MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 1 &&
557       MacroKernelParams.Kc == 1)
558     return Node;
559   Node = tileNode(
560       Node, "1st level tiling",
561       {MacroKernelParams.Mc, MacroKernelParams.Nc, MacroKernelParams.Kc}, 1);
562   Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
563   Node = permuteBandNodeDimensions(Node, 1, 2);
564   Node = permuteBandNodeDimensions(Node, 0, 2);
565   return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
566 }
567 
568 /// Get parameters of the BLIS micro kernel.
569 ///
570 /// We choose the Mr and Nr parameters of the micro kernel to be large enough
571 /// such that no stalls caused by the combination of latencies and dependencies
572 /// are introduced during the updates of the resulting matrix of the matrix
573 /// multiplication. However, they should also be as small as possible to
574 /// release more registers for entries of multiplied matrices.
575 ///
576 /// @param TTI Target Transform Info.
577 /// @return The structure of type MicroKernelParamsTy.
578 /// @see MicroKernelParamsTy
579 static struct MicroKernelParamsTy
580 getMicroKernelParams(const llvm::TargetTransformInfo *TTI) {
581   assert(TTI && "The target transform info should be provided.");
582 
583   // Nvec - Number of double-precision floating-point numbers that can be hold
584   // by a vector register. Use 2 by default.
585   auto Nvec = TTI->getRegisterBitWidth(true) / 64;
586   if (Nvec == 0)
587     Nvec = 2;
588   int Nr =
589       ceil(sqrt(Nvec * LatencyVectorFma * ThrougputVectorFma) / Nvec) * Nvec;
590   int Mr = ceil(Nvec * LatencyVectorFma * ThrougputVectorFma / Nr);
591   return {Mr, Nr};
592 }
593 
594 /// Get parameters of the BLIS macro kernel.
595 ///
596 /// During the computation of matrix multiplication, blocks of partitioned
597 /// matrices are mapped to different layers of the memory hierarchy.
598 /// To optimize data reuse, blocks should be ideally kept in cache between
599 /// iterations. Since parameters of the macro kernel determine sizes of these
600 /// blocks, there are upper and lower bounds on these parameters.
601 ///
602 /// @param MicroKernelParams Parameters of the micro-kernel
603 ///                          to be taken into account.
604 /// @return The structure of type MacroKernelParamsTy.
605 /// @see MacroKernelParamsTy
606 /// @see MicroKernelParamsTy
607 static struct MacroKernelParamsTy
608 getMacroKernelParams(const MicroKernelParamsTy &MicroKernelParams) {
609   // According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf,
610   // it requires information about the first two levels of a cache to determine
611   // all the parameters of a macro-kernel. It also checks that an associativity
612   // degree of a cache level is greater than two. Otherwise, another algorithm
613   // for determination of the parameters should be used.
614   if (!(MicroKernelParams.Mr > 0 && MicroKernelParams.Nr > 0 &&
615         CacheLevelSizes.size() >= 2 && CacheLevelAssociativity.size() >= 2 &&
616         CacheLevelSizes[0] > 0 && CacheLevelSizes[1] > 0 &&
617         CacheLevelAssociativity[0] > 2 && CacheLevelAssociativity[1] > 2))
618     return {1, 1, 1};
619   // The quotient should be greater than zero.
620   if (PollyPatternMatchingNcQuotient <= 0)
621     return {1, 1, 1};
622   int Car = floor(
623       (CacheLevelAssociativity[0] - 1) /
624       (1 + static_cast<double>(MicroKernelParams.Nr) / MicroKernelParams.Mr));
625   int Kc = (Car * CacheLevelSizes[0]) /
626            (MicroKernelParams.Mr * CacheLevelAssociativity[0] * 8);
627   double Cac = static_cast<double>(Kc * 8 * CacheLevelAssociativity[1]) /
628                CacheLevelSizes[1];
629   int Mc = floor((CacheLevelAssociativity[1] - 2) / Cac);
630   int Nc = PollyPatternMatchingNcQuotient * MicroKernelParams.Nr;
631   return {Mc, Nc, Kc};
632 }
633 
634 /// Identify a memory access through the shape of its memory access relation.
635 ///
636 /// Identify the unique memory access in @p Stmt, that has an access relation
637 /// equal to @p ExpectedAccessRelation.
638 ///
639 /// @param Stmt The SCoP statement that contains the memory accesses under
640 ///             consideration.
641 /// @param ExpectedAccessRelation The access relation that identifies
642 ///                               the memory access.
643 /// @return  The memory access of @p Stmt whose memory access relation is equal
644 ///          to @p ExpectedAccessRelation. nullptr in case there is no or more
645 ///          than one such access.
646 MemoryAccess *
647 identifyAccessByAccessRelation(ScopStmt *Stmt,
648                                __isl_take isl_map *ExpectedAccessRelation) {
649   if (isl_map_has_tuple_id(ExpectedAccessRelation, isl_dim_out))
650     ExpectedAccessRelation =
651         isl_map_reset_tuple_id(ExpectedAccessRelation, isl_dim_out);
652   MemoryAccess *IdentifiedAccess = nullptr;
653   for (auto *Access : *Stmt) {
654     auto *AccessRelation = Access->getAccessRelation();
655     AccessRelation = isl_map_reset_tuple_id(AccessRelation, isl_dim_out);
656     if (isl_map_is_equal(ExpectedAccessRelation, AccessRelation)) {
657       if (IdentifiedAccess) {
658         isl_map_free(AccessRelation);
659         isl_map_free(ExpectedAccessRelation);
660         return nullptr;
661       }
662       IdentifiedAccess = Access;
663     }
664     isl_map_free(AccessRelation);
665   }
666   isl_map_free(ExpectedAccessRelation);
667   return IdentifiedAccess;
668 }
669 
670 /// Add constrains to @Dim dimension of @p ExtMap.
671 ///
672 /// If @ExtMap has the following form [O0, O1, O2]->[I1, I2, I3],
673 /// the following constraint will be added
674 /// Bound * OM <= IM <= Bound * (OM + 1) - 1,
675 /// where M is @p Dim and Bound is @p Bound.
676 ///
677 /// @param ExtMap The isl map to be modified.
678 /// @param Dim The output dimension to be modfied.
679 /// @param Bound The value that is used to specify the constraint.
680 /// @return The modified isl map
681 __isl_give isl_map *
682 addExtensionMapMatMulDimConstraint(__isl_take isl_map *ExtMap, unsigned Dim,
683                                    unsigned Bound) {
684   assert(Bound != 0);
685   auto *ExtMapSpace = isl_map_get_space(ExtMap);
686   auto *ConstrSpace = isl_local_space_from_space(ExtMapSpace);
687   auto *Constr =
688       isl_constraint_alloc_inequality(isl_local_space_copy(ConstrSpace));
689   Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_out, Dim, 1);
690   Constr =
691       isl_constraint_set_coefficient_si(Constr, isl_dim_in, Dim, Bound * (-1));
692   ExtMap = isl_map_add_constraint(ExtMap, Constr);
693   Constr = isl_constraint_alloc_inequality(ConstrSpace);
694   Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_out, Dim, -1);
695   Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_in, Dim, Bound);
696   Constr = isl_constraint_set_constant_si(Constr, Bound - 1);
697   return isl_map_add_constraint(ExtMap, Constr);
698 }
699 
700 /// Create an access relation that is specific for matrix multiplication
701 /// pattern.
702 ///
703 /// Create an access relation of the following form:
704 /// { [O0, O1, O2]->[I1, I2, I3] :
705 ///   FirstOutputDimBound * O0 <= I1 <= FirstOutputDimBound * (O0 + 1) - 1
706 ///   and SecondOutputDimBound * O1 <= I2 <= SecondOutputDimBound * (O1 + 1) - 1
707 ///   and ThirdOutputDimBound * O2 <= I3 <= ThirdOutputDimBound * (O2 + 1) - 1}
708 ///   where FirstOutputDimBound is @p FirstOutputDimBound,
709 ///   SecondOutputDimBound is @p SecondOutputDimBound,
710 ///   ThirdOutputDimBound is @p ThirdOutputDimBound
711 ///
712 /// @param Ctx The isl context.
713 /// @param FirstOutputDimBound,
714 ///        SecondOutputDimBound,
715 ///        ThirdOutputDimBound The parameters of the access relation.
716 /// @return The specified access relation.
717 __isl_give isl_map *getMatMulExt(isl_ctx *Ctx, unsigned FirstOutputDimBound,
718                                  unsigned SecondOutputDimBound,
719                                  unsigned ThirdOutputDimBound) {
720   auto *NewRelSpace = isl_space_alloc(Ctx, 0, 3, 3);
721   auto *extensionMap = isl_map_universe(NewRelSpace);
722   if (!FirstOutputDimBound)
723     extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 0, 0);
724   else
725     extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 0,
726                                                       FirstOutputDimBound);
727   if (!SecondOutputDimBound)
728     extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 1, 0);
729   else
730     extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 1,
731                                                       SecondOutputDimBound);
732   if (!ThirdOutputDimBound)
733     extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 2, 0);
734   else
735     extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 2,
736                                                       ThirdOutputDimBound);
737   return extensionMap;
738 }
739 
740 /// Create an access relation that is specific to the matrix
741 ///        multiplication pattern.
742 ///
743 /// Create an access relation of the following form:
744 /// Stmt[O0, O1, O2]->[OI, OJ],
745 /// where I is @p I, J is @J
746 ///
747 /// @param Stmt The SCoP statement for which to generate the access relation.
748 /// @param I The index of the input dimension that is mapped to the first output
749 ///          dimension.
750 /// @param J The index of the input dimension that is mapped to the second
751 ///          output dimension.
752 /// @return The specified access relation.
753 __isl_give isl_map *
754 getMatMulPatternOriginalAccessRelation(ScopStmt *Stmt, unsigned I, unsigned J) {
755   auto *AccessRelSpace = isl_space_alloc(Stmt->getIslCtx(), 0, 3, 2);
756   auto *AccessRel = isl_map_universe(AccessRelSpace);
757   AccessRel = isl_map_equate(AccessRel, isl_dim_in, I, isl_dim_out, 0);
758   AccessRel = isl_map_equate(AccessRel, isl_dim_in, J, isl_dim_out, 1);
759   AccessRel = isl_map_set_tuple_id(AccessRel, isl_dim_in, Stmt->getDomainId());
760   return AccessRel;
761 }
762 
763 /// Identify the memory access that corresponds to the access to the second
764 /// operand of the matrix multiplication.
765 ///
766 /// Identify the memory access that corresponds to the access
767 /// to the matrix B of the matrix multiplication C = A x B.
768 ///
769 /// @param Stmt The SCoP statement that contains the memory accesses
770 ///             under consideration.
771 /// @return The memory access of @p Stmt that corresponds to the access
772 ///         to the second operand of the matrix multiplication.
773 MemoryAccess *identifyAccessA(ScopStmt *Stmt) {
774   auto *OriginalRel = getMatMulPatternOriginalAccessRelation(Stmt, 0, 2);
775   return identifyAccessByAccessRelation(Stmt, OriginalRel);
776 }
777 
778 /// Identify the memory access that corresponds to the access to the first
779 /// operand of the matrix multiplication.
780 ///
781 /// Identify the memory access that corresponds to the access
782 /// to the matrix A of the matrix multiplication C = A x B.
783 ///
784 /// @param Stmt The SCoP statement that contains the memory accesses
785 ///             under consideration.
786 /// @return The memory access of @p Stmt that corresponds to the access
787 ///         to the first operand of the matrix multiplication.
788 MemoryAccess *identifyAccessB(ScopStmt *Stmt) {
789   auto *OriginalRel = getMatMulPatternOriginalAccessRelation(Stmt, 2, 1);
790   return identifyAccessByAccessRelation(Stmt, OriginalRel);
791 }
792 
793 /// Create an access relation that is specific to
794 ///        the matrix multiplication pattern.
795 ///
796 /// Create an access relation of the following form:
797 /// [O0, O1, O2, O3, O4, O5, O6, O7, O8] -> [OI, O5, OJ]
798 /// where I is @p FirstDim, J is @p SecondDim.
799 ///
800 /// It can be used, for example, to create relations that helps to consequently
801 /// access elements of operands of a matrix multiplication after creation of
802 /// the BLIS micro and macro kernels.
803 ///
804 /// @see ScheduleTreeOptimizer::createMicroKernel
805 /// @see ScheduleTreeOptimizer::createMacroKernel
806 ///
807 /// Subsequently, the described access relation is applied to the range of
808 /// @p MapOldIndVar, that is used to map original induction variables to
809 /// the ones, which are produced by schedule transformations. It helps to
810 /// define relations using a new space and, at the same time, keep them
811 /// in the original one.
812 ///
813 /// @param MapOldIndVar The relation, which maps original induction variables
814 ///                     to the ones, which are produced by schedule
815 ///                     transformations.
816 /// @param FirstDim, SecondDim The input dimensions that are used to define
817 ///        the specified access relation.
818 /// @return The specified access relation.
819 __isl_give isl_map *getMatMulAccRel(__isl_take isl_map *MapOldIndVar,
820                                     unsigned FirstDim, unsigned SecondDim) {
821   auto *Ctx = isl_map_get_ctx(MapOldIndVar);
822   auto *AccessRelSpace = isl_space_alloc(Ctx, 0, 9, 3);
823   auto *AccessRel = isl_map_universe(AccessRelSpace);
824   AccessRel = isl_map_equate(AccessRel, isl_dim_in, FirstDim, isl_dim_out, 0);
825   AccessRel = isl_map_equate(AccessRel, isl_dim_in, 5, isl_dim_out, 1);
826   AccessRel = isl_map_equate(AccessRel, isl_dim_in, SecondDim, isl_dim_out, 2);
827   return isl_map_apply_range(MapOldIndVar, AccessRel);
828 }
829 
830 __isl_give isl_schedule_node *
831 createExtensionNode(__isl_take isl_schedule_node *Node,
832                     __isl_take isl_map *ExtensionMap) {
833   auto *Extension = isl_union_map_from_map(ExtensionMap);
834   auto *NewNode = isl_schedule_node_from_extension(Extension);
835   return isl_schedule_node_graft_before(Node, NewNode);
836 }
837 
838 /// Apply the packing transformation.
839 ///
840 /// The packing transformation can be described as a data-layout
841 /// transformation that requires to introduce a new array, copy data
842 /// to the array, and change memory access locations of the compute kernel
843 /// to reference the array.
844 ///
845 /// @param Node The schedule node to be optimized.
846 /// @param MapOldIndVar The relation, which maps original induction variables
847 ///                     to the ones, which are produced by schedule
848 ///                     transformations.
849 /// @param MicroParams, MacroParams Parameters of the BLIS kernel
850 ///                                 to be taken into account.
851 /// @return The optimized schedule node.
852 static __isl_give isl_schedule_node *optimizeDataLayoutMatrMulPattern(
853     __isl_take isl_schedule_node *Node, __isl_take isl_map *MapOldIndVar,
854     MicroKernelParamsTy MicroParams, MacroKernelParamsTy MacroParams) {
855   // Check whether memory accesses of the SCoP statement correspond to
856   // the matrix multiplication pattern and if this is true, obtain them.
857   auto InputDimsId = isl_map_get_tuple_id(MapOldIndVar, isl_dim_in);
858   auto *Stmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
859   isl_id_free(InputDimsId);
860   MemoryAccess *MemAccessA = identifyAccessA(Stmt);
861   MemoryAccess *MemAccessB = identifyAccessB(Stmt);
862   if (!MemAccessA || !MemAccessB) {
863     isl_map_free(MapOldIndVar);
864     return Node;
865   }
866 
867   // Create a copy statement that corresponds to the memory access to the
868   // matrix B, the second operand of the matrix multiplication.
869   Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
870   Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
871   Node = isl_schedule_node_parent(Node);
872   Node = isl_schedule_node_child(isl_schedule_node_band_split(Node, 2), 0);
873   auto *AccRel = getMatMulAccRel(isl_map_copy(MapOldIndVar), 3, 7);
874   unsigned FirstDimSize = MacroParams.Nc / MicroParams.Nr;
875   unsigned SecondDimSize = MacroParams.Kc;
876   unsigned ThirdDimSize = MicroParams.Nr;
877   auto *SAI = Stmt->getParent()->createScopArrayInfo(
878       MemAccessB->getElementType(), "Packed_B",
879       {FirstDimSize, SecondDimSize, ThirdDimSize});
880   AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
881   auto *OldAcc = MemAccessB->getAccessRelation();
882   MemAccessB->setNewAccessRelation(AccRel);
883   auto *ExtMap =
884       getMatMulExt(Stmt->getIslCtx(), 0, MacroParams.Nc, MacroParams.Kc);
885   isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
886   isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
887   ExtMap = isl_map_project_out(ExtMap, isl_dim_in, 2, 1);
888   auto *Domain = Stmt->getDomain();
889 
890   // Restrict the domains of the copy statements to only execute when also its
891   // originating statement is executed.
892   auto *DomainId = isl_set_get_tuple_id(Domain);
893   auto *NewStmt = Stmt->getParent()->addScopStmt(
894       OldAcc, MemAccessB->getAccessRelation(), isl_set_copy(Domain));
895   ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, isl_id_copy(DomainId));
896   ExtMap = isl_map_intersect_range(ExtMap, isl_set_copy(Domain));
897   ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
898   Node = createExtensionNode(Node, ExtMap);
899 
900   // Create a copy statement that corresponds to the memory access
901   // to the matrix A, the first operand of the matrix multiplication.
902   Node = isl_schedule_node_child(Node, 0);
903   AccRel = getMatMulAccRel(MapOldIndVar, 4, 6);
904   FirstDimSize = MacroParams.Mc / MicroParams.Mr;
905   ThirdDimSize = MicroParams.Mr;
906   SAI = Stmt->getParent()->createScopArrayInfo(
907       MemAccessA->getElementType(), "Packed_A",
908       {FirstDimSize, SecondDimSize, ThirdDimSize});
909   AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
910   OldAcc = MemAccessA->getAccessRelation();
911   MemAccessA->setNewAccessRelation(AccRel);
912   ExtMap = getMatMulExt(Stmt->getIslCtx(), MacroParams.Mc, 0, MacroParams.Kc);
913   isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
914   isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
915   NewStmt = Stmt->getParent()->addScopStmt(
916       OldAcc, MemAccessA->getAccessRelation(), isl_set_copy(Domain));
917 
918   // Restrict the domains of the copy statements to only execute when also its
919   // originating statement is executed.
920   ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, DomainId);
921   ExtMap = isl_map_intersect_range(ExtMap, Domain);
922   ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
923   Node = createExtensionNode(Node, ExtMap);
924   Node = isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
925   return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
926 }
927 
928 /// Get a relation mapping induction variables produced by schedule
929 /// transformations to the original ones.
930 ///
931 /// @param Node The schedule node produced as the result of creation
932 ///        of the BLIS kernels.
933 /// @param MicroKernelParams, MacroKernelParams Parameters of the BLIS kernel
934 ///                                             to be taken into account.
935 /// @return  The relation mapping original induction variables to the ones
936 ///          produced by schedule transformation.
937 /// @see ScheduleTreeOptimizer::createMicroKernel
938 /// @see ScheduleTreeOptimizer::createMacroKernel
939 /// @see getMacroKernelParams
940 __isl_give isl_map *
941 getInductionVariablesSubstitution(__isl_take isl_schedule_node *Node,
942                                   MicroKernelParamsTy MicroKernelParams,
943                                   MacroKernelParamsTy MacroKernelParams) {
944   auto *Child = isl_schedule_node_get_child(Node, 0);
945   auto *UnMapOldIndVar = isl_schedule_node_get_prefix_schedule_union_map(Child);
946   isl_schedule_node_free(Child);
947   auto *MapOldIndVar = isl_map_from_union_map(UnMapOldIndVar);
948   if (isl_map_dim(MapOldIndVar, isl_dim_out) > 9)
949     MapOldIndVar =
950         isl_map_project_out(MapOldIndVar, isl_dim_out, 0,
951                             isl_map_dim(MapOldIndVar, isl_dim_out) - 9);
952   return MapOldIndVar;
953 }
954 
955 __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern(
956     __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
957   assert(TTI && "The target transform info should be provided.");
958   auto MicroKernelParams = getMicroKernelParams(TTI);
959   auto MacroKernelParams = getMacroKernelParams(MicroKernelParams);
960   Node = createMacroKernel(Node, MacroKernelParams);
961   Node = createMicroKernel(Node, MicroKernelParams);
962   if (MacroKernelParams.Mc == 1 || MacroKernelParams.Nc == 1 ||
963       MacroKernelParams.Kc == 1)
964     return Node;
965   auto *MapOldIndVar = getInductionVariablesSubstitution(
966       Node, MicroKernelParams, MacroKernelParams);
967   if (!MapOldIndVar)
968     return Node;
969   return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams,
970                                           MacroKernelParams);
971 }
972 
973 bool ScheduleTreeOptimizer::isMatrMultPattern(
974     __isl_keep isl_schedule_node *Node) {
975   auto *PartialSchedule =
976       isl_schedule_node_band_get_partial_schedule_union_map(Node);
977   if (isl_schedule_node_band_n_member(Node) != 3 ||
978       isl_union_map_n_map(PartialSchedule) != 1) {
979     isl_union_map_free(PartialSchedule);
980     return false;
981   }
982   auto *NewPartialSchedule = isl_map_from_union_map(PartialSchedule);
983   NewPartialSchedule = circularShiftOutputDims(NewPartialSchedule);
984   if (containsMatrMult(NewPartialSchedule)) {
985     isl_map_free(NewPartialSchedule);
986     return true;
987   }
988   isl_map_free(NewPartialSchedule);
989   return false;
990 }
991 
992 __isl_give isl_schedule_node *
993 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node,
994                                     void *User) {
995   if (!isTileableBandNode(Node))
996     return Node;
997 
998   if (PMBasedOpts && User && isMatrMultPattern(Node)) {
999     DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
1000     const llvm::TargetTransformInfo *TTI;
1001     TTI = static_cast<const llvm::TargetTransformInfo *>(User);
1002     Node = optimizeMatMulPattern(Node, TTI);
1003   }
1004 
1005   return standardBandOpts(Node, User);
1006 }
1007 
1008 __isl_give isl_schedule *
1009 ScheduleTreeOptimizer::optimizeSchedule(__isl_take isl_schedule *Schedule,
1010                                         const llvm::TargetTransformInfo *TTI) {
1011   isl_schedule_node *Root = isl_schedule_get_root(Schedule);
1012   Root = optimizeScheduleNode(Root, TTI);
1013   isl_schedule_free(Schedule);
1014   auto S = isl_schedule_node_get_schedule(Root);
1015   isl_schedule_node_free(Root);
1016   return S;
1017 }
1018 
1019 __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeScheduleNode(
1020     __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
1021   Node = isl_schedule_node_map_descendant_bottom_up(
1022       Node, optimizeBand, const_cast<void *>(static_cast<const void *>(TTI)));
1023   return Node;
1024 }
1025 
1026 bool ScheduleTreeOptimizer::isProfitableSchedule(
1027     Scop &S, __isl_keep isl_schedule *NewSchedule) {
1028   // To understand if the schedule has been optimized we check if the schedule
1029   // has changed at all.
1030   // TODO: We can improve this by tracking if any necessarily beneficial
1031   // transformations have been performed. This can e.g. be tiling, loop
1032   // interchange, or ...) We can track this either at the place where the
1033   // transformation has been performed or, in case of automatic ILP based
1034   // optimizations, by comparing (yet to be defined) performance metrics
1035   // before/after the scheduling optimizer
1036   // (e.g., #stride-one accesses)
1037   if (S.containsExtensionNode(NewSchedule))
1038     return true;
1039   auto *NewScheduleMap = isl_schedule_get_map(NewSchedule);
1040   isl_union_map *OldSchedule = S.getSchedule();
1041   assert(OldSchedule && "Only IslScheduleOptimizer can insert extension nodes "
1042                         "that make Scop::getSchedule() return nullptr.");
1043   bool changed = !isl_union_map_is_equal(OldSchedule, NewScheduleMap);
1044   isl_union_map_free(OldSchedule);
1045   isl_union_map_free(NewScheduleMap);
1046   return changed;
1047 }
1048 
1049 namespace {
1050 class IslScheduleOptimizer : public ScopPass {
1051 public:
1052   static char ID;
1053   explicit IslScheduleOptimizer() : ScopPass(ID) { LastSchedule = nullptr; }
1054 
1055   ~IslScheduleOptimizer() { isl_schedule_free(LastSchedule); }
1056 
1057   /// Optimize the schedule of the SCoP @p S.
1058   bool runOnScop(Scop &S) override;
1059 
1060   /// Print the new schedule for the SCoP @p S.
1061   void printScop(raw_ostream &OS, Scop &S) const override;
1062 
1063   /// Register all analyses and transformation required.
1064   void getAnalysisUsage(AnalysisUsage &AU) const override;
1065 
1066   /// Release the internal memory.
1067   void releaseMemory() override {
1068     isl_schedule_free(LastSchedule);
1069     LastSchedule = nullptr;
1070   }
1071 
1072 private:
1073   isl_schedule *LastSchedule;
1074 };
1075 } // namespace
1076 
1077 char IslScheduleOptimizer::ID = 0;
1078 
1079 bool IslScheduleOptimizer::runOnScop(Scop &S) {
1080 
1081   // Skip empty SCoPs but still allow code generation as it will delete the
1082   // loops present but not needed.
1083   if (S.getSize() == 0) {
1084     S.markAsOptimized();
1085     return false;
1086   }
1087 
1088   const Dependences &D =
1089       getAnalysis<DependenceInfo>().getDependences(Dependences::AL_Statement);
1090 
1091   if (!D.hasValidDependences())
1092     return false;
1093 
1094   isl_schedule_free(LastSchedule);
1095   LastSchedule = nullptr;
1096 
1097   // Build input data.
1098   int ValidityKinds =
1099       Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
1100   int ProximityKinds;
1101 
1102   if (OptimizeDeps == "all")
1103     ProximityKinds =
1104         Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
1105   else if (OptimizeDeps == "raw")
1106     ProximityKinds = Dependences::TYPE_RAW;
1107   else {
1108     errs() << "Do not know how to optimize for '" << OptimizeDeps << "'"
1109            << " Falling back to optimizing all dependences.\n";
1110     ProximityKinds =
1111         Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
1112   }
1113 
1114   isl_union_set *Domain = S.getDomains();
1115 
1116   if (!Domain)
1117     return false;
1118 
1119   isl_union_map *Validity = D.getDependences(ValidityKinds);
1120   isl_union_map *Proximity = D.getDependences(ProximityKinds);
1121 
1122   // Simplify the dependences by removing the constraints introduced by the
1123   // domains. This can speed up the scheduling time significantly, as large
1124   // constant coefficients will be removed from the dependences. The
1125   // introduction of some additional dependences reduces the possible
1126   // transformations, but in most cases, such transformation do not seem to be
1127   // interesting anyway. In some cases this option may stop the scheduler to
1128   // find any schedule.
1129   if (SimplifyDeps == "yes") {
1130     Validity = isl_union_map_gist_domain(Validity, isl_union_set_copy(Domain));
1131     Validity = isl_union_map_gist_range(Validity, isl_union_set_copy(Domain));
1132     Proximity =
1133         isl_union_map_gist_domain(Proximity, isl_union_set_copy(Domain));
1134     Proximity = isl_union_map_gist_range(Proximity, isl_union_set_copy(Domain));
1135   } else if (SimplifyDeps != "no") {
1136     errs() << "warning: Option -polly-opt-simplify-deps should either be 'yes' "
1137               "or 'no'. Falling back to default: 'yes'\n";
1138   }
1139 
1140   DEBUG(dbgs() << "\n\nCompute schedule from: ");
1141   DEBUG(dbgs() << "Domain := " << stringFromIslObj(Domain) << ";\n");
1142   DEBUG(dbgs() << "Proximity := " << stringFromIslObj(Proximity) << ";\n");
1143   DEBUG(dbgs() << "Validity := " << stringFromIslObj(Validity) << ";\n");
1144 
1145   unsigned IslSerializeSCCs;
1146 
1147   if (FusionStrategy == "max") {
1148     IslSerializeSCCs = 0;
1149   } else if (FusionStrategy == "min") {
1150     IslSerializeSCCs = 1;
1151   } else {
1152     errs() << "warning: Unknown fusion strategy. Falling back to maximal "
1153               "fusion.\n";
1154     IslSerializeSCCs = 0;
1155   }
1156 
1157   int IslMaximizeBands;
1158 
1159   if (MaximizeBandDepth == "yes") {
1160     IslMaximizeBands = 1;
1161   } else if (MaximizeBandDepth == "no") {
1162     IslMaximizeBands = 0;
1163   } else {
1164     errs() << "warning: Option -polly-opt-maximize-bands should either be 'yes'"
1165               " or 'no'. Falling back to default: 'yes'\n";
1166     IslMaximizeBands = 1;
1167   }
1168 
1169   int IslOuterCoincidence;
1170 
1171   if (OuterCoincidence == "yes") {
1172     IslOuterCoincidence = 1;
1173   } else if (OuterCoincidence == "no") {
1174     IslOuterCoincidence = 0;
1175   } else {
1176     errs() << "warning: Option -polly-opt-outer-coincidence should either be "
1177               "'yes' or 'no'. Falling back to default: 'no'\n";
1178     IslOuterCoincidence = 0;
1179   }
1180 
1181   isl_ctx *Ctx = S.getIslCtx();
1182 
1183   isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence);
1184   isl_options_set_schedule_serialize_sccs(Ctx, IslSerializeSCCs);
1185   isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands);
1186   isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm);
1187   isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient);
1188   isl_options_set_tile_scale_tile_loops(Ctx, 0);
1189 
1190   auto OnErrorStatus = isl_options_get_on_error(Ctx);
1191   isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE);
1192 
1193   isl_schedule_constraints *ScheduleConstraints;
1194   ScheduleConstraints = isl_schedule_constraints_on_domain(Domain);
1195   ScheduleConstraints =
1196       isl_schedule_constraints_set_proximity(ScheduleConstraints, Proximity);
1197   ScheduleConstraints = isl_schedule_constraints_set_validity(
1198       ScheduleConstraints, isl_union_map_copy(Validity));
1199   ScheduleConstraints =
1200       isl_schedule_constraints_set_coincidence(ScheduleConstraints, Validity);
1201   isl_schedule *Schedule;
1202   Schedule = isl_schedule_constraints_compute_schedule(ScheduleConstraints);
1203   isl_options_set_on_error(Ctx, OnErrorStatus);
1204 
1205   // In cases the scheduler is not able to optimize the code, we just do not
1206   // touch the schedule.
1207   if (!Schedule)
1208     return false;
1209 
1210   DEBUG({
1211     auto *P = isl_printer_to_str(Ctx);
1212     P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
1213     P = isl_printer_print_schedule(P, Schedule);
1214     auto *str = isl_printer_get_str(P);
1215     dbgs() << "NewScheduleTree: \n" << str << "\n";
1216     free(str);
1217     isl_printer_free(P);
1218   });
1219 
1220   Function &F = S.getFunction();
1221   auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1222   isl_schedule *NewSchedule =
1223       ScheduleTreeOptimizer::optimizeSchedule(Schedule, TTI);
1224 
1225   if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewSchedule)) {
1226     isl_schedule_free(NewSchedule);
1227     return false;
1228   }
1229 
1230   S.setScheduleTree(NewSchedule);
1231   S.markAsOptimized();
1232 
1233   if (OptimizedScops)
1234     S.dump();
1235 
1236   return false;
1237 }
1238 
1239 void IslScheduleOptimizer::printScop(raw_ostream &OS, Scop &) const {
1240   isl_printer *p;
1241   char *ScheduleStr;
1242 
1243   OS << "Calculated schedule:\n";
1244 
1245   if (!LastSchedule) {
1246     OS << "n/a\n";
1247     return;
1248   }
1249 
1250   p = isl_printer_to_str(isl_schedule_get_ctx(LastSchedule));
1251   p = isl_printer_print_schedule(p, LastSchedule);
1252   ScheduleStr = isl_printer_get_str(p);
1253   isl_printer_free(p);
1254 
1255   OS << ScheduleStr << "\n";
1256 }
1257 
1258 void IslScheduleOptimizer::getAnalysisUsage(AnalysisUsage &AU) const {
1259   ScopPass::getAnalysisUsage(AU);
1260   AU.addRequired<DependenceInfo>();
1261   AU.addRequired<TargetTransformInfoWrapperPass>();
1262 }
1263 
1264 Pass *polly::createIslScheduleOptimizerPass() {
1265   return new IslScheduleOptimizer();
1266 }
1267 
1268 INITIALIZE_PASS_BEGIN(IslScheduleOptimizer, "polly-opt-isl",
1269                       "Polly - Optimize schedule of SCoP", false, false);
1270 INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
1271 INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass);
1272 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass);
1273 INITIALIZE_PASS_END(IslScheduleOptimizer, "polly-opt-isl",
1274                     "Polly - Optimize schedule of SCoP", false, false)
1275