1 //===- Schedule.cpp - Calculate an optimized schedule ---------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass generates an entirely new schedule tree from the data dependences
11 // and iteration domains. The new schedule tree is computed in two steps:
12 //
13 // 1) The isl scheduling optimizer is run
14 //
15 // The isl scheduling optimizer creates a new schedule tree that maximizes
16 // parallelism and tileability and minimizes data-dependence distances. The
17 // algorithm used is a modified version of the ``Pluto'' algorithm:
18 //
19 //   U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
20 //   A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
21 //   In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
22 //   Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008.
23 //
24 // 2) A set of post-scheduling transformations is applied on the schedule tree.
25 //
26 // These optimizations include:
27 //
28 //  - Tiling of the innermost tilable bands
29 //  - Prevectorization - The coice of a possible outer loop that is strip-mined
30 //                       to the innermost level to enable inner-loop
31 //                       vectorization.
32 //  - Some optimizations for spatial locality are also planned.
33 //
34 // For a detailed description of the schedule tree itself please see section 6
35 // of:
36 //
37 // Polyhedral AST generation is more than scanning polyhedra
38 // Tobias Grosser, Sven Verdoolaege, Albert Cohen
39 // ACM Transations on Programming Languages and Systems (TOPLAS),
40 // 37(4), July 2015
41 // http://www.grosser.es/#pub-polyhedral-AST-generation
42 //
43 // This publication also contains a detailed discussion of the different options
44 // for polyhedral loop unrolling, full/partial tile separation and other uses
45 // of the schedule tree.
46 //
47 //===----------------------------------------------------------------------===//
48 
49 #include "polly/ScheduleOptimizer.h"
50 #include "polly/CodeGen/CodeGeneration.h"
51 #include "polly/DependenceInfo.h"
52 #include "polly/LinkAllPasses.h"
53 #include "polly/Options.h"
54 #include "polly/ScopInfo.h"
55 #include "polly/Support/GICHelper.h"
56 #include "llvm/Analysis/TargetTransformInfo.h"
57 #include "llvm/Support/Debug.h"
58 #include "isl/aff.h"
59 #include "isl/band.h"
60 #include "isl/constraint.h"
61 #include "isl/map.h"
62 #include "isl/options.h"
63 #include "isl/printer.h"
64 #include "isl/schedule.h"
65 #include "isl/schedule_node.h"
66 #include "isl/space.h"
67 #include "isl/union_map.h"
68 #include "isl/union_set.h"
69 
70 using namespace llvm;
71 using namespace polly;
72 
73 #define DEBUG_TYPE "polly-opt-isl"
74 
75 static cl::opt<std::string>
76     OptimizeDeps("polly-opt-optimize-only",
77                  cl::desc("Only a certain kind of dependences (all/raw)"),
78                  cl::Hidden, cl::init("all"), cl::ZeroOrMore,
79                  cl::cat(PollyCategory));
80 
81 static cl::opt<std::string>
82     SimplifyDeps("polly-opt-simplify-deps",
83                  cl::desc("Dependences should be simplified (yes/no)"),
84                  cl::Hidden, cl::init("yes"), cl::ZeroOrMore,
85                  cl::cat(PollyCategory));
86 
87 static cl::opt<int> MaxConstantTerm(
88     "polly-opt-max-constant-term",
89     cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden,
90     cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));
91 
92 static cl::opt<int> MaxCoefficient(
93     "polly-opt-max-coefficient",
94     cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden,
95     cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));
96 
97 static cl::opt<std::string> FusionStrategy(
98     "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"),
99     cl::Hidden, cl::init("min"), cl::ZeroOrMore, cl::cat(PollyCategory));
100 
101 static cl::opt<std::string>
102     MaximizeBandDepth("polly-opt-maximize-bands",
103                       cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
104                       cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory));
105 
106 static cl::opt<std::string> OuterCoincidence(
107     "polly-opt-outer-coincidence",
108     cl::desc("Try to construct schedules where the outer member of each band "
109              "satisfies the coincidence constraints (yes/no)"),
110     cl::Hidden, cl::init("no"), cl::ZeroOrMore, cl::cat(PollyCategory));
111 
112 static cl::opt<int> PrevectorWidth(
113     "polly-prevect-width",
114     cl::desc(
115         "The number of loop iterations to strip-mine for pre-vectorization"),
116     cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory));
117 
118 static cl::opt<bool> FirstLevelTiling("polly-tiling",
119                                       cl::desc("Enable loop tiling"),
120                                       cl::init(true), cl::ZeroOrMore,
121                                       cl::cat(PollyCategory));
122 
123 static cl::opt<int> LatencyVectorFma(
124     "polly-target-latency-vector-fma",
125     cl::desc("The minimal number of cycles between issuing two "
126              "dependent consecutive vector fused multiply-add "
127              "instructions."),
128     cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
129 
130 static cl::opt<int> ThroughputVectorFma(
131     "polly-target-throughput-vector-fma",
132     cl::desc("A throughput of the processor floating-point arithmetic units "
133              "expressed in the number of vector fused multiply-add "
134              "instructions per clock cycle."),
135     cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory));
136 
137 // This option, along with --polly-target-2nd-cache-level-associativity,
138 // --polly-target-1st-cache-level-size, and --polly-target-2st-cache-level-size
139 // represent the parameters of the target cache, which do not have typical
140 // values that can be used by default. However, to apply the pattern matching
141 // optimizations, we use the values of the parameters of Intel Core i7-3820
142 // SandyBridge in case the parameters are not specified. Such an approach helps
143 // also to attain the high-performance on IBM POWER System S822 and IBM Power
144 // 730 Express server.
145 static cl::opt<int> FirstCacheLevelAssociativity(
146     "polly-target-1st-cache-level-associativity",
147     cl::desc("The associativity of the first cache level."), cl::Hidden,
148     cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
149 
150 static cl::opt<int> SecondCacheLevelAssociativity(
151     "polly-target-2nd-cache-level-associativity",
152     cl::desc("The associativity of the second cache level."), cl::Hidden,
153     cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
154 
155 static cl::opt<int> FirstCacheLevelSize(
156     "polly-target-1st-cache-level-size",
157     cl::desc("The size of the first cache level specified in bytes."),
158     cl::Hidden, cl::init(32768), cl::ZeroOrMore, cl::cat(PollyCategory));
159 
160 static cl::opt<int> SecondCacheLevelSize(
161     "polly-target-2nd-cache-level-size",
162     cl::desc("The size of the second level specified in bytes."), cl::Hidden,
163     cl::init(262144), cl::ZeroOrMore, cl::cat(PollyCategory));
164 
165 static cl::opt<int> FirstLevelDefaultTileSize(
166     "polly-default-tile-size",
167     cl::desc("The default tile size (if not enough were provided by"
168              " --polly-tile-sizes)"),
169     cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory));
170 
171 static cl::list<int> FirstLevelTileSizes(
172     "polly-tile-sizes", cl::desc("A tile size for each loop dimension, filled "
173                                  "with --polly-default-tile-size"),
174     cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory));
175 
176 static cl::opt<bool>
177     SecondLevelTiling("polly-2nd-level-tiling",
178                       cl::desc("Enable a 2nd level loop of loop tiling"),
179                       cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
180 
181 static cl::opt<int> SecondLevelDefaultTileSize(
182     "polly-2nd-level-default-tile-size",
183     cl::desc("The default 2nd-level tile size (if not enough were provided by"
184              " --polly-2nd-level-tile-sizes)"),
185     cl::Hidden, cl::init(16), cl::ZeroOrMore, cl::cat(PollyCategory));
186 
187 static cl::list<int>
188     SecondLevelTileSizes("polly-2nd-level-tile-sizes",
189                          cl::desc("A tile size for each loop dimension, filled "
190                                   "with --polly-default-tile-size"),
191                          cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
192                          cl::cat(PollyCategory));
193 
194 static cl::opt<bool> RegisterTiling("polly-register-tiling",
195                                     cl::desc("Enable register tiling"),
196                                     cl::init(false), cl::ZeroOrMore,
197                                     cl::cat(PollyCategory));
198 
199 static cl::opt<int> RegisterDefaultTileSize(
200     "polly-register-tiling-default-tile-size",
201     cl::desc("The default register tile size (if not enough were provided by"
202              " --polly-register-tile-sizes)"),
203     cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory));
204 
205 static cl::opt<int> PollyPatternMatchingNcQuotient(
206     "polly-pattern-matching-nc-quotient",
207     cl::desc("Quotient that is obtained by dividing Nc, the parameter of the"
208              "macro-kernel, by Nr, the parameter of the micro-kernel"),
209     cl::Hidden, cl::init(256), cl::ZeroOrMore, cl::cat(PollyCategory));
210 
211 static cl::list<int>
212     RegisterTileSizes("polly-register-tile-sizes",
213                       cl::desc("A tile size for each loop dimension, filled "
214                                "with --polly-register-tile-size"),
215                       cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
216                       cl::cat(PollyCategory));
217 
218 static cl::opt<bool>
219     PMBasedOpts("polly-pattern-matching-based-opts",
220                 cl::desc("Perform optimizations based on pattern matching"),
221                 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
222 
223 static cl::opt<bool> OptimizedScops(
224     "polly-optimized-scops",
225     cl::desc("Polly - Dump polyhedral description of Scops optimized with "
226              "the isl scheduling optimizer and the set of post-scheduling "
227              "transformations is applied on the schedule tree"),
228     cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
229 
230 /// Create an isl_union_set, which describes the isolate option based on
231 /// IsoalteDomain.
232 ///
233 /// @param IsolateDomain An isl_set whose last dimension is the only one that
234 ///                      should belong to the current band node.
235 static __isl_give isl_union_set *
236 getIsolateOptions(__isl_take isl_set *IsolateDomain) {
237   auto Dims = isl_set_dim(IsolateDomain, isl_dim_set);
238   auto *IsolateRelation = isl_map_from_domain(IsolateDomain);
239   IsolateRelation = isl_map_move_dims(IsolateRelation, isl_dim_out, 0,
240                                       isl_dim_in, Dims - 1, 1);
241   auto *IsolateOption = isl_map_wrap(IsolateRelation);
242   auto *Id = isl_id_alloc(isl_set_get_ctx(IsolateOption), "isolate", nullptr);
243   return isl_union_set_from_set(isl_set_set_tuple_id(IsolateOption, Id));
244 }
245 
246 /// Create an isl_union_set, which describes the atomic option for the dimension
247 /// of the current node.
248 ///
249 /// It may help to reduce the size of generated code.
250 ///
251 /// @param Ctx An isl_ctx, which is used to create the isl_union_set.
252 static __isl_give isl_union_set *getAtomicOptions(__isl_take isl_ctx *Ctx) {
253   auto *Space = isl_space_set_alloc(Ctx, 0, 1);
254   auto *AtomicOption = isl_set_universe(Space);
255   auto *Id = isl_id_alloc(Ctx, "atomic", nullptr);
256   return isl_union_set_from_set(isl_set_set_tuple_id(AtomicOption, Id));
257 }
258 
259 /// Make the last dimension of Set to take values from 0 to VectorWidth - 1.
260 ///
261 /// @param Set         A set, which should be modified.
262 /// @param VectorWidth A parameter, which determines the constraint.
263 static __isl_give isl_set *addExtentConstraints(__isl_take isl_set *Set,
264                                                 int VectorWidth) {
265   auto Dims = isl_set_dim(Set, isl_dim_set);
266   auto Space = isl_set_get_space(Set);
267   auto *LocalSpace = isl_local_space_from_space(Space);
268   auto *ExtConstr =
269       isl_constraint_alloc_inequality(isl_local_space_copy(LocalSpace));
270   ExtConstr = isl_constraint_set_constant_si(ExtConstr, 0);
271   ExtConstr =
272       isl_constraint_set_coefficient_si(ExtConstr, isl_dim_set, Dims - 1, 1);
273   Set = isl_set_add_constraint(Set, ExtConstr);
274   ExtConstr = isl_constraint_alloc_inequality(LocalSpace);
275   ExtConstr = isl_constraint_set_constant_si(ExtConstr, VectorWidth - 1);
276   ExtConstr =
277       isl_constraint_set_coefficient_si(ExtConstr, isl_dim_set, Dims - 1, -1);
278   return isl_set_add_constraint(Set, ExtConstr);
279 }
280 
281 /// Build the desired set of partial tile prefixes.
282 ///
283 /// We build a set of partial tile prefixes, which are prefixes of the vector
284 /// loop that have exactly VectorWidth iterations.
285 ///
286 /// 1. Get all prefixes of the vector loop.
287 /// 2. Extend it to a set, which has exactly VectorWidth iterations for
288 ///    any prefix from the set that was built on the previous step.
289 /// 3. Subtract loop domain from it, project out the vector loop dimension and
290 ///    get a set of prefixes, which don't have exactly VectorWidth iterations.
291 /// 4. Subtract it from all prefixes of the vector loop and get the desired
292 ///    set.
293 ///
294 /// @param ScheduleRange A range of a map, which describes a prefix schedule
295 ///                      relation.
296 static __isl_give isl_set *
297 getPartialTilePrefixes(__isl_take isl_set *ScheduleRange, int VectorWidth) {
298   auto Dims = isl_set_dim(ScheduleRange, isl_dim_set);
299   auto *LoopPrefixes = isl_set_project_out(isl_set_copy(ScheduleRange),
300                                            isl_dim_set, Dims - 1, 1);
301   auto *ExtentPrefixes =
302       isl_set_add_dims(isl_set_copy(LoopPrefixes), isl_dim_set, 1);
303   ExtentPrefixes = addExtentConstraints(ExtentPrefixes, VectorWidth);
304   auto *BadPrefixes = isl_set_subtract(ExtentPrefixes, ScheduleRange);
305   BadPrefixes = isl_set_project_out(BadPrefixes, isl_dim_set, Dims - 1, 1);
306   return isl_set_subtract(LoopPrefixes, BadPrefixes);
307 }
308 
309 __isl_give isl_schedule_node *ScheduleTreeOptimizer::isolateFullPartialTiles(
310     __isl_take isl_schedule_node *Node, int VectorWidth) {
311   assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
312   Node = isl_schedule_node_child(Node, 0);
313   Node = isl_schedule_node_child(Node, 0);
314   auto *SchedRelUMap = isl_schedule_node_get_prefix_schedule_relation(Node);
315   auto *ScheduleRelation = isl_map_from_union_map(SchedRelUMap);
316   auto *ScheduleRange = isl_map_range(ScheduleRelation);
317   auto *IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth);
318   auto *AtomicOption = getAtomicOptions(isl_set_get_ctx(IsolateDomain));
319   auto *IsolateOption = getIsolateOptions(IsolateDomain);
320   Node = isl_schedule_node_parent(Node);
321   Node = isl_schedule_node_parent(Node);
322   auto *Options = isl_union_set_union(IsolateOption, AtomicOption);
323   Node = isl_schedule_node_band_set_ast_build_options(Node, Options);
324   return Node;
325 }
326 
327 __isl_give isl_schedule_node *
328 ScheduleTreeOptimizer::prevectSchedBand(__isl_take isl_schedule_node *Node,
329                                         unsigned DimToVectorize,
330                                         int VectorWidth) {
331   assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
332 
333   auto Space = isl_schedule_node_band_get_space(Node);
334   auto ScheduleDimensions = isl_space_dim(Space, isl_dim_set);
335   isl_space_free(Space);
336   assert(DimToVectorize < ScheduleDimensions);
337 
338   if (DimToVectorize > 0) {
339     Node = isl_schedule_node_band_split(Node, DimToVectorize);
340     Node = isl_schedule_node_child(Node, 0);
341   }
342   if (DimToVectorize < ScheduleDimensions - 1)
343     Node = isl_schedule_node_band_split(Node, 1);
344   Space = isl_schedule_node_band_get_space(Node);
345   auto Sizes = isl_multi_val_zero(Space);
346   auto Ctx = isl_schedule_node_get_ctx(Node);
347   Sizes =
348       isl_multi_val_set_val(Sizes, 0, isl_val_int_from_si(Ctx, VectorWidth));
349   Node = isl_schedule_node_band_tile(Node, Sizes);
350   Node = isolateFullPartialTiles(Node, VectorWidth);
351   Node = isl_schedule_node_child(Node, 0);
352   // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise,
353   // we will have troubles to match it in the backend.
354   Node = isl_schedule_node_band_set_ast_build_options(
355       Node, isl_union_set_read_from_str(Ctx, "{ unroll[x]: 1 = 0 }"));
356   Node = isl_schedule_node_band_sink(Node);
357   Node = isl_schedule_node_child(Node, 0);
358   if (isl_schedule_node_get_type(Node) == isl_schedule_node_leaf)
359     Node = isl_schedule_node_parent(Node);
360   isl_id *LoopMarker = isl_id_alloc(Ctx, "SIMD", nullptr);
361   Node = isl_schedule_node_insert_mark(Node, LoopMarker);
362   return Node;
363 }
364 
365 __isl_give isl_schedule_node *
366 ScheduleTreeOptimizer::tileNode(__isl_take isl_schedule_node *Node,
367                                 const char *Identifier, ArrayRef<int> TileSizes,
368                                 int DefaultTileSize) {
369   auto Ctx = isl_schedule_node_get_ctx(Node);
370   auto Space = isl_schedule_node_band_get_space(Node);
371   auto Dims = isl_space_dim(Space, isl_dim_set);
372   auto Sizes = isl_multi_val_zero(Space);
373   std::string IdentifierString(Identifier);
374   for (unsigned i = 0; i < Dims; i++) {
375     auto tileSize = i < TileSizes.size() ? TileSizes[i] : DefaultTileSize;
376     Sizes = isl_multi_val_set_val(Sizes, i, isl_val_int_from_si(Ctx, tileSize));
377   }
378   auto TileLoopMarkerStr = IdentifierString + " - Tiles";
379   isl_id *TileLoopMarker =
380       isl_id_alloc(Ctx, TileLoopMarkerStr.c_str(), nullptr);
381   Node = isl_schedule_node_insert_mark(Node, TileLoopMarker);
382   Node = isl_schedule_node_child(Node, 0);
383   Node = isl_schedule_node_band_tile(Node, Sizes);
384   Node = isl_schedule_node_child(Node, 0);
385   auto PointLoopMarkerStr = IdentifierString + " - Points";
386   isl_id *PointLoopMarker =
387       isl_id_alloc(Ctx, PointLoopMarkerStr.c_str(), nullptr);
388   Node = isl_schedule_node_insert_mark(Node, PointLoopMarker);
389   Node = isl_schedule_node_child(Node, 0);
390   return Node;
391 }
392 
393 __isl_give isl_schedule_node *
394 ScheduleTreeOptimizer::applyRegisterTiling(__isl_take isl_schedule_node *Node,
395                                            llvm::ArrayRef<int> TileSizes,
396                                            int DefaultTileSize) {
397   auto *Ctx = isl_schedule_node_get_ctx(Node);
398   Node = tileNode(Node, "Register tiling", TileSizes, DefaultTileSize);
399   Node = isl_schedule_node_band_set_ast_build_options(
400       Node, isl_union_set_read_from_str(Ctx, "{unroll[x]}"));
401   return Node;
402 }
403 
404 bool ScheduleTreeOptimizer::isTileableBandNode(
405     __isl_keep isl_schedule_node *Node) {
406   if (isl_schedule_node_get_type(Node) != isl_schedule_node_band)
407     return false;
408 
409   if (isl_schedule_node_n_children(Node) != 1)
410     return false;
411 
412   if (!isl_schedule_node_band_get_permutable(Node))
413     return false;
414 
415   auto Space = isl_schedule_node_band_get_space(Node);
416   auto Dims = isl_space_dim(Space, isl_dim_set);
417   isl_space_free(Space);
418 
419   if (Dims <= 1)
420     return false;
421 
422   auto Child = isl_schedule_node_get_child(Node, 0);
423   auto Type = isl_schedule_node_get_type(Child);
424   isl_schedule_node_free(Child);
425 
426   if (Type != isl_schedule_node_leaf)
427     return false;
428 
429   return true;
430 }
431 
432 __isl_give isl_schedule_node *
433 ScheduleTreeOptimizer::standardBandOpts(__isl_take isl_schedule_node *Node,
434                                         void *User) {
435   if (FirstLevelTiling)
436     Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes,
437                     FirstLevelDefaultTileSize);
438 
439   if (SecondLevelTiling)
440     Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes,
441                     SecondLevelDefaultTileSize);
442 
443   if (RegisterTiling)
444     Node =
445         applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize);
446 
447   if (PollyVectorizerChoice == VECTORIZER_NONE)
448     return Node;
449 
450   auto Space = isl_schedule_node_band_get_space(Node);
451   auto Dims = isl_space_dim(Space, isl_dim_set);
452   isl_space_free(Space);
453 
454   for (int i = Dims - 1; i >= 0; i--)
455     if (isl_schedule_node_band_member_get_coincident(Node, i)) {
456       Node = prevectSchedBand(Node, i, PrevectorWidth);
457       break;
458     }
459 
460   return Node;
461 }
462 
463 /// Check whether output dimensions of the map rely on the specified input
464 /// dimension.
465 ///
466 /// @param IslMap The isl map to be considered.
467 /// @param DimNum The number of an input dimension to be checked.
468 static bool isInputDimUsed(__isl_take isl_map *IslMap, unsigned DimNum) {
469   auto *CheckedAccessRelation =
470       isl_map_project_out(isl_map_copy(IslMap), isl_dim_in, DimNum, 1);
471   CheckedAccessRelation =
472       isl_map_insert_dims(CheckedAccessRelation, isl_dim_in, DimNum, 1);
473   auto *InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in);
474   CheckedAccessRelation =
475       isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_in, InputDimsId);
476   InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_out);
477   CheckedAccessRelation =
478       isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_out, InputDimsId);
479   auto res = !isl_map_is_equal(CheckedAccessRelation, IslMap);
480   isl_map_free(CheckedAccessRelation);
481   isl_map_free(IslMap);
482   return res;
483 }
484 
485 /// Check if the SCoP statement could probably be optimized with analytical
486 /// modeling.
487 ///
488 /// containsMatrMult tries to determine whether the following conditions
489 /// are true:
490 /// 1. all memory accesses of the statement will have stride 0 or 1,
491 ///    if we interchange loops (switch the variable used in the inner
492 ///    loop to the outer loop).
493 /// 2. all memory accesses of the statement except from the last one, are
494 ///    read memory access and the last one is write memory access.
495 /// 3. all subscripts of the last memory access of the statement don't contain
496 ///    the variable used in the inner loop.
497 ///
498 /// @param PartialSchedule The PartialSchedule that contains a SCoP statement
499 ///        to check.
500 static bool containsMatrMult(__isl_keep isl_map *PartialSchedule) {
501   auto InputDimsId = isl_map_get_tuple_id(PartialSchedule, isl_dim_in);
502   auto *ScpStmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
503   isl_id_free(InputDimsId);
504   if (ScpStmt->size() <= 1)
505     return false;
506   auto MemA = ScpStmt->begin();
507   for (unsigned i = 0; i < ScpStmt->size() - 2 && MemA != ScpStmt->end();
508        i++, MemA++)
509     if (!(*MemA)->isRead() ||
510         ((*MemA)->isArrayKind() &&
511          !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) ||
512            (*MemA)->isStrideZero(isl_map_copy(PartialSchedule)))))
513       return false;
514   MemA++;
515   if (!(*MemA)->isWrite() || !(*MemA)->isArrayKind() ||
516       !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) ||
517         (*MemA)->isStrideZero(isl_map_copy(PartialSchedule))))
518     return false;
519   auto DimNum = isl_map_dim(PartialSchedule, isl_dim_in);
520   return !isInputDimUsed((*MemA)->getAccessRelation(), DimNum - 1);
521 }
522 
523 /// Circular shift of output dimensions of the integer map.
524 ///
525 /// @param IslMap The isl map to be modified.
526 static __isl_give isl_map *circularShiftOutputDims(__isl_take isl_map *IslMap) {
527   auto DimNum = isl_map_dim(IslMap, isl_dim_out);
528   if (DimNum == 0)
529     return IslMap;
530   auto InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in);
531   IslMap = isl_map_move_dims(IslMap, isl_dim_in, 0, isl_dim_out, DimNum - 1, 1);
532   IslMap = isl_map_move_dims(IslMap, isl_dim_out, 0, isl_dim_in, 0, 1);
533   return isl_map_set_tuple_id(IslMap, isl_dim_in, InputDimsId);
534 }
535 
536 /// Permute two dimensions of the band node.
537 ///
538 /// Permute FirstDim and SecondDim dimensions of the Node.
539 ///
540 /// @param Node The band node to be modified.
541 /// @param FirstDim The first dimension to be permuted.
542 /// @param SecondDim The second dimension to be permuted.
543 static __isl_give isl_schedule_node *
544 permuteBandNodeDimensions(__isl_take isl_schedule_node *Node, unsigned FirstDim,
545                           unsigned SecondDim) {
546   assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band &&
547          isl_schedule_node_band_n_member(Node) > std::max(FirstDim, SecondDim));
548   auto PartialSchedule = isl_schedule_node_band_get_partial_schedule(Node);
549   auto PartialScheduleFirstDim =
550       isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, FirstDim);
551   auto PartialScheduleSecondDim =
552       isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, SecondDim);
553   PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
554       PartialSchedule, SecondDim, PartialScheduleFirstDim);
555   PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
556       PartialSchedule, FirstDim, PartialScheduleSecondDim);
557   Node = isl_schedule_node_delete(Node);
558   Node = isl_schedule_node_insert_partial_schedule(Node, PartialSchedule);
559   return Node;
560 }
561 
562 __isl_give isl_schedule_node *ScheduleTreeOptimizer::createMicroKernel(
563     __isl_take isl_schedule_node *Node, MicroKernelParamsTy MicroKernelParams) {
564   applyRegisterTiling(Node, {MicroKernelParams.Mr, MicroKernelParams.Nr}, 1);
565   Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
566   Node = permuteBandNodeDimensions(Node, 0, 1);
567   return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
568 }
569 
570 __isl_give isl_schedule_node *ScheduleTreeOptimizer::createMacroKernel(
571     __isl_take isl_schedule_node *Node, MacroKernelParamsTy MacroKernelParams) {
572   assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
573   if (MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 1 &&
574       MacroKernelParams.Kc == 1)
575     return Node;
576   Node = tileNode(
577       Node, "1st level tiling",
578       {MacroKernelParams.Mc, MacroKernelParams.Nc, MacroKernelParams.Kc}, 1);
579   Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
580   Node = permuteBandNodeDimensions(Node, 1, 2);
581   Node = permuteBandNodeDimensions(Node, 0, 2);
582   return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
583 }
584 
585 /// Get parameters of the BLIS micro kernel.
586 ///
587 /// We choose the Mr and Nr parameters of the micro kernel to be large enough
588 /// such that no stalls caused by the combination of latencies and dependencies
589 /// are introduced during the updates of the resulting matrix of the matrix
590 /// multiplication. However, they should also be as small as possible to
591 /// release more registers for entries of multiplied matrices.
592 ///
593 /// @param TTI Target Transform Info.
594 /// @return The structure of type MicroKernelParamsTy.
595 /// @see MicroKernelParamsTy
596 static struct MicroKernelParamsTy
597 getMicroKernelParams(const llvm::TargetTransformInfo *TTI) {
598   assert(TTI && "The target transform info should be provided.");
599 
600   // Nvec - Number of double-precision floating-point numbers that can be hold
601   // by a vector register. Use 2 by default.
602   auto Nvec = TTI->getRegisterBitWidth(true) / 64;
603   if (Nvec == 0)
604     Nvec = 2;
605   int Nr =
606       ceil(sqrt(Nvec * LatencyVectorFma * ThroughputVectorFma) / Nvec) * Nvec;
607   int Mr = ceil(Nvec * LatencyVectorFma * ThroughputVectorFma / Nr);
608   return {Mr, Nr};
609 }
610 
611 /// Get parameters of the BLIS macro kernel.
612 ///
613 /// During the computation of matrix multiplication, blocks of partitioned
614 /// matrices are mapped to different layers of the memory hierarchy.
615 /// To optimize data reuse, blocks should be ideally kept in cache between
616 /// iterations. Since parameters of the macro kernel determine sizes of these
617 /// blocks, there are upper and lower bounds on these parameters.
618 ///
619 /// @param MicroKernelParams Parameters of the micro-kernel
620 ///                          to be taken into account.
621 /// @return The structure of type MacroKernelParamsTy.
622 /// @see MacroKernelParamsTy
623 /// @see MicroKernelParamsTy
624 static struct MacroKernelParamsTy
625 getMacroKernelParams(const MicroKernelParamsTy &MicroKernelParams) {
626   // According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf,
627   // it requires information about the first two levels of a cache to determine
628   // all the parameters of a macro-kernel. It also checks that an associativity
629   // degree of a cache level is greater than two. Otherwise, another algorithm
630   // for determination of the parameters should be used.
631   if (!(MicroKernelParams.Mr > 0 && MicroKernelParams.Nr > 0 &&
632         FirstCacheLevelSize > 0 && SecondCacheLevelSize > 0 &&
633         FirstCacheLevelAssociativity > 2 && SecondCacheLevelAssociativity > 2))
634     return {1, 1, 1};
635   // The quotient should be greater than zero.
636   if (PollyPatternMatchingNcQuotient <= 0)
637     return {1, 1, 1};
638   int Car = floor(
639       (FirstCacheLevelAssociativity - 1) /
640       (1 + static_cast<double>(MicroKernelParams.Nr) / MicroKernelParams.Mr));
641   int Kc = (Car * FirstCacheLevelSize) /
642            (MicroKernelParams.Mr * FirstCacheLevelAssociativity * 8);
643   double Cac = static_cast<double>(Kc * 8 * SecondCacheLevelAssociativity) /
644                SecondCacheLevelSize;
645   int Mc = floor((SecondCacheLevelAssociativity - 2) / Cac);
646   int Nc = PollyPatternMatchingNcQuotient * MicroKernelParams.Nr;
647   return {Mc, Nc, Kc};
648 }
649 
650 /// Identify a memory access through the shape of its memory access relation.
651 ///
652 /// Identify the unique memory access in @p Stmt, that has an access relation
653 /// equal to @p ExpectedAccessRelation.
654 ///
655 /// @param Stmt The SCoP statement that contains the memory accesses under
656 ///             consideration.
657 /// @param ExpectedAccessRelation The access relation that identifies
658 ///                               the memory access.
659 /// @return  The memory access of @p Stmt whose memory access relation is equal
660 ///          to @p ExpectedAccessRelation. nullptr in case there is no or more
661 ///          than one such access.
662 MemoryAccess *
663 identifyAccessByAccessRelation(ScopStmt *Stmt,
664                                __isl_take isl_map *ExpectedAccessRelation) {
665   if (isl_map_has_tuple_id(ExpectedAccessRelation, isl_dim_out))
666     ExpectedAccessRelation =
667         isl_map_reset_tuple_id(ExpectedAccessRelation, isl_dim_out);
668   MemoryAccess *IdentifiedAccess = nullptr;
669   for (auto *Access : *Stmt) {
670     auto *AccessRelation = Access->getAccessRelation();
671     AccessRelation = isl_map_reset_tuple_id(AccessRelation, isl_dim_out);
672     if (isl_map_is_equal(ExpectedAccessRelation, AccessRelation)) {
673       if (IdentifiedAccess) {
674         isl_map_free(AccessRelation);
675         isl_map_free(ExpectedAccessRelation);
676         return nullptr;
677       }
678       IdentifiedAccess = Access;
679     }
680     isl_map_free(AccessRelation);
681   }
682   isl_map_free(ExpectedAccessRelation);
683   return IdentifiedAccess;
684 }
685 
686 /// Add constrains to @Dim dimension of @p ExtMap.
687 ///
688 /// If @ExtMap has the following form [O0, O1, O2]->[I1, I2, I3],
689 /// the following constraint will be added
690 /// Bound * OM <= IM <= Bound * (OM + 1) - 1,
691 /// where M is @p Dim and Bound is @p Bound.
692 ///
693 /// @param ExtMap The isl map to be modified.
694 /// @param Dim The output dimension to be modfied.
695 /// @param Bound The value that is used to specify the constraint.
696 /// @return The modified isl map
697 __isl_give isl_map *
698 addExtensionMapMatMulDimConstraint(__isl_take isl_map *ExtMap, unsigned Dim,
699                                    unsigned Bound) {
700   assert(Bound != 0);
701   auto *ExtMapSpace = isl_map_get_space(ExtMap);
702   auto *ConstrSpace = isl_local_space_from_space(ExtMapSpace);
703   auto *Constr =
704       isl_constraint_alloc_inequality(isl_local_space_copy(ConstrSpace));
705   Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_out, Dim, 1);
706   Constr =
707       isl_constraint_set_coefficient_si(Constr, isl_dim_in, Dim, Bound * (-1));
708   ExtMap = isl_map_add_constraint(ExtMap, Constr);
709   Constr = isl_constraint_alloc_inequality(ConstrSpace);
710   Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_out, Dim, -1);
711   Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_in, Dim, Bound);
712   Constr = isl_constraint_set_constant_si(Constr, Bound - 1);
713   return isl_map_add_constraint(ExtMap, Constr);
714 }
715 
716 /// Create an access relation that is specific for matrix multiplication
717 /// pattern.
718 ///
719 /// Create an access relation of the following form:
720 /// { [O0, O1, O2]->[I1, I2, I3] :
721 ///   FirstOutputDimBound * O0 <= I1 <= FirstOutputDimBound * (O0 + 1) - 1
722 ///   and SecondOutputDimBound * O1 <= I2 <= SecondOutputDimBound * (O1 + 1) - 1
723 ///   and ThirdOutputDimBound * O2 <= I3 <= ThirdOutputDimBound * (O2 + 1) - 1}
724 ///   where FirstOutputDimBound is @p FirstOutputDimBound,
725 ///   SecondOutputDimBound is @p SecondOutputDimBound,
726 ///   ThirdOutputDimBound is @p ThirdOutputDimBound
727 ///
728 /// @param Ctx The isl context.
729 /// @param FirstOutputDimBound,
730 ///        SecondOutputDimBound,
731 ///        ThirdOutputDimBound The parameters of the access relation.
732 /// @return The specified access relation.
733 __isl_give isl_map *getMatMulExt(isl_ctx *Ctx, unsigned FirstOutputDimBound,
734                                  unsigned SecondOutputDimBound,
735                                  unsigned ThirdOutputDimBound) {
736   auto *NewRelSpace = isl_space_alloc(Ctx, 0, 3, 3);
737   auto *extensionMap = isl_map_universe(NewRelSpace);
738   if (!FirstOutputDimBound)
739     extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 0, 0);
740   else
741     extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 0,
742                                                       FirstOutputDimBound);
743   if (!SecondOutputDimBound)
744     extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 1, 0);
745   else
746     extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 1,
747                                                       SecondOutputDimBound);
748   if (!ThirdOutputDimBound)
749     extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 2, 0);
750   else
751     extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 2,
752                                                       ThirdOutputDimBound);
753   return extensionMap;
754 }
755 
756 /// Create an access relation that is specific to the matrix
757 ///        multiplication pattern.
758 ///
759 /// Create an access relation of the following form:
760 /// Stmt[O0, O1, O2]->[OI, OJ],
761 /// where I is @p I, J is @J
762 ///
763 /// @param Stmt The SCoP statement for which to generate the access relation.
764 /// @param I The index of the input dimension that is mapped to the first output
765 ///          dimension.
766 /// @param J The index of the input dimension that is mapped to the second
767 ///          output dimension.
768 /// @return The specified access relation.
769 __isl_give isl_map *
770 getMatMulPatternOriginalAccessRelation(ScopStmt *Stmt, unsigned I, unsigned J) {
771   auto *AccessRelSpace = isl_space_alloc(Stmt->getIslCtx(), 0, 3, 2);
772   auto *AccessRel = isl_map_universe(AccessRelSpace);
773   AccessRel = isl_map_equate(AccessRel, isl_dim_in, I, isl_dim_out, 0);
774   AccessRel = isl_map_equate(AccessRel, isl_dim_in, J, isl_dim_out, 1);
775   AccessRel = isl_map_set_tuple_id(AccessRel, isl_dim_in, Stmt->getDomainId());
776   return AccessRel;
777 }
778 
779 /// Identify the memory access that corresponds to the access to the second
780 /// operand of the matrix multiplication.
781 ///
782 /// Identify the memory access that corresponds to the access
783 /// to the matrix B of the matrix multiplication C = A x B.
784 ///
785 /// @param Stmt The SCoP statement that contains the memory accesses
786 ///             under consideration.
787 /// @return The memory access of @p Stmt that corresponds to the access
788 ///         to the second operand of the matrix multiplication.
789 MemoryAccess *identifyAccessA(ScopStmt *Stmt) {
790   auto *OriginalRel = getMatMulPatternOriginalAccessRelation(Stmt, 0, 2);
791   return identifyAccessByAccessRelation(Stmt, OriginalRel);
792 }
793 
794 /// Identify the memory access that corresponds to the access to the first
795 /// operand of the matrix multiplication.
796 ///
797 /// Identify the memory access that corresponds to the access
798 /// to the matrix A of the matrix multiplication C = A x B.
799 ///
800 /// @param Stmt The SCoP statement that contains the memory accesses
801 ///             under consideration.
802 /// @return The memory access of @p Stmt that corresponds to the access
803 ///         to the first operand of the matrix multiplication.
804 MemoryAccess *identifyAccessB(ScopStmt *Stmt) {
805   auto *OriginalRel = getMatMulPatternOriginalAccessRelation(Stmt, 2, 1);
806   return identifyAccessByAccessRelation(Stmt, OriginalRel);
807 }
808 
809 /// Create an access relation that is specific to
810 ///        the matrix multiplication pattern.
811 ///
812 /// Create an access relation of the following form:
813 /// [O0, O1, O2, O3, O4, O5, O6, O7, O8] -> [OI, O5, OJ]
814 /// where I is @p FirstDim, J is @p SecondDim.
815 ///
816 /// It can be used, for example, to create relations that helps to consequently
817 /// access elements of operands of a matrix multiplication after creation of
818 /// the BLIS micro and macro kernels.
819 ///
820 /// @see ScheduleTreeOptimizer::createMicroKernel
821 /// @see ScheduleTreeOptimizer::createMacroKernel
822 ///
823 /// Subsequently, the described access relation is applied to the range of
824 /// @p MapOldIndVar, that is used to map original induction variables to
825 /// the ones, which are produced by schedule transformations. It helps to
826 /// define relations using a new space and, at the same time, keep them
827 /// in the original one.
828 ///
829 /// @param MapOldIndVar The relation, which maps original induction variables
830 ///                     to the ones, which are produced by schedule
831 ///                     transformations.
832 /// @param FirstDim, SecondDim The input dimensions that are used to define
833 ///        the specified access relation.
834 /// @return The specified access relation.
835 __isl_give isl_map *getMatMulAccRel(__isl_take isl_map *MapOldIndVar,
836                                     unsigned FirstDim, unsigned SecondDim) {
837   auto *Ctx = isl_map_get_ctx(MapOldIndVar);
838   auto *AccessRelSpace = isl_space_alloc(Ctx, 0, 9, 3);
839   auto *AccessRel = isl_map_universe(AccessRelSpace);
840   AccessRel = isl_map_equate(AccessRel, isl_dim_in, FirstDim, isl_dim_out, 0);
841   AccessRel = isl_map_equate(AccessRel, isl_dim_in, 5, isl_dim_out, 1);
842   AccessRel = isl_map_equate(AccessRel, isl_dim_in, SecondDim, isl_dim_out, 2);
843   return isl_map_apply_range(MapOldIndVar, AccessRel);
844 }
845 
846 __isl_give isl_schedule_node *
847 createExtensionNode(__isl_take isl_schedule_node *Node,
848                     __isl_take isl_map *ExtensionMap) {
849   auto *Extension = isl_union_map_from_map(ExtensionMap);
850   auto *NewNode = isl_schedule_node_from_extension(Extension);
851   return isl_schedule_node_graft_before(Node, NewNode);
852 }
853 
854 /// Apply the packing transformation.
855 ///
856 /// The packing transformation can be described as a data-layout
857 /// transformation that requires to introduce a new array, copy data
858 /// to the array, and change memory access locations of the compute kernel
859 /// to reference the array.
860 ///
861 /// @param Node The schedule node to be optimized.
862 /// @param MapOldIndVar The relation, which maps original induction variables
863 ///                     to the ones, which are produced by schedule
864 ///                     transformations.
865 /// @param MicroParams, MacroParams Parameters of the BLIS kernel
866 ///                                 to be taken into account.
867 /// @return The optimized schedule node.
868 static __isl_give isl_schedule_node *optimizeDataLayoutMatrMulPattern(
869     __isl_take isl_schedule_node *Node, __isl_take isl_map *MapOldIndVar,
870     MicroKernelParamsTy MicroParams, MacroKernelParamsTy MacroParams) {
871   // Check whether memory accesses of the SCoP statement correspond to
872   // the matrix multiplication pattern and if this is true, obtain them.
873   auto InputDimsId = isl_map_get_tuple_id(MapOldIndVar, isl_dim_in);
874   auto *Stmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
875   isl_id_free(InputDimsId);
876   MemoryAccess *MemAccessA = identifyAccessA(Stmt);
877   MemoryAccess *MemAccessB = identifyAccessB(Stmt);
878   if (!MemAccessA || !MemAccessB) {
879     isl_map_free(MapOldIndVar);
880     return Node;
881   }
882 
883   // Create a copy statement that corresponds to the memory access to the
884   // matrix B, the second operand of the matrix multiplication.
885   Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
886   Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
887   Node = isl_schedule_node_parent(Node);
888   Node = isl_schedule_node_child(isl_schedule_node_band_split(Node, 2), 0);
889   auto *AccRel = getMatMulAccRel(isl_map_copy(MapOldIndVar), 3, 7);
890   unsigned FirstDimSize = MacroParams.Nc / MicroParams.Nr;
891   unsigned SecondDimSize = MacroParams.Kc;
892   unsigned ThirdDimSize = MicroParams.Nr;
893   auto *SAI = Stmt->getParent()->createScopArrayInfo(
894       MemAccessB->getElementType(), "Packed_B",
895       {FirstDimSize, SecondDimSize, ThirdDimSize});
896   AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
897   auto *OldAcc = MemAccessB->getAccessRelation();
898   MemAccessB->setNewAccessRelation(AccRel);
899   auto *ExtMap =
900       getMatMulExt(Stmt->getIslCtx(), 0, MacroParams.Nc, MacroParams.Kc);
901   isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
902   isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
903   ExtMap = isl_map_project_out(ExtMap, isl_dim_in, 2, 1);
904   auto *Domain = Stmt->getDomain();
905 
906   // Restrict the domains of the copy statements to only execute when also its
907   // originating statement is executed.
908   auto *DomainId = isl_set_get_tuple_id(Domain);
909   auto *NewStmt = Stmt->getParent()->addScopStmt(
910       OldAcc, MemAccessB->getAccessRelation(), isl_set_copy(Domain));
911   ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, isl_id_copy(DomainId));
912   ExtMap = isl_map_intersect_range(ExtMap, isl_set_copy(Domain));
913   ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
914   Node = createExtensionNode(Node, ExtMap);
915 
916   // Create a copy statement that corresponds to the memory access
917   // to the matrix A, the first operand of the matrix multiplication.
918   Node = isl_schedule_node_child(Node, 0);
919   AccRel = getMatMulAccRel(MapOldIndVar, 4, 6);
920   FirstDimSize = MacroParams.Mc / MicroParams.Mr;
921   ThirdDimSize = MicroParams.Mr;
922   SAI = Stmt->getParent()->createScopArrayInfo(
923       MemAccessA->getElementType(), "Packed_A",
924       {FirstDimSize, SecondDimSize, ThirdDimSize});
925   AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
926   OldAcc = MemAccessA->getAccessRelation();
927   MemAccessA->setNewAccessRelation(AccRel);
928   ExtMap = getMatMulExt(Stmt->getIslCtx(), MacroParams.Mc, 0, MacroParams.Kc);
929   isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
930   isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
931   NewStmt = Stmt->getParent()->addScopStmt(
932       OldAcc, MemAccessA->getAccessRelation(), isl_set_copy(Domain));
933 
934   // Restrict the domains of the copy statements to only execute when also its
935   // originating statement is executed.
936   ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, DomainId);
937   ExtMap = isl_map_intersect_range(ExtMap, Domain);
938   ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
939   Node = createExtensionNode(Node, ExtMap);
940   Node = isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
941   return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
942 }
943 
944 /// Get a relation mapping induction variables produced by schedule
945 /// transformations to the original ones.
946 ///
947 /// @param Node The schedule node produced as the result of creation
948 ///        of the BLIS kernels.
949 /// @param MicroKernelParams, MacroKernelParams Parameters of the BLIS kernel
950 ///                                             to be taken into account.
951 /// @return  The relation mapping original induction variables to the ones
952 ///          produced by schedule transformation.
953 /// @see ScheduleTreeOptimizer::createMicroKernel
954 /// @see ScheduleTreeOptimizer::createMacroKernel
955 /// @see getMacroKernelParams
956 __isl_give isl_map *
957 getInductionVariablesSubstitution(__isl_take isl_schedule_node *Node,
958                                   MicroKernelParamsTy MicroKernelParams,
959                                   MacroKernelParamsTy MacroKernelParams) {
960   auto *Child = isl_schedule_node_get_child(Node, 0);
961   auto *UnMapOldIndVar = isl_schedule_node_get_prefix_schedule_union_map(Child);
962   isl_schedule_node_free(Child);
963   auto *MapOldIndVar = isl_map_from_union_map(UnMapOldIndVar);
964   if (isl_map_dim(MapOldIndVar, isl_dim_out) > 9)
965     MapOldIndVar =
966         isl_map_project_out(MapOldIndVar, isl_dim_out, 0,
967                             isl_map_dim(MapOldIndVar, isl_dim_out) - 9);
968   return MapOldIndVar;
969 }
970 
971 __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern(
972     __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
973   assert(TTI && "The target transform info should be provided.");
974   auto MicroKernelParams = getMicroKernelParams(TTI);
975   auto MacroKernelParams = getMacroKernelParams(MicroKernelParams);
976   Node = createMacroKernel(Node, MacroKernelParams);
977   Node = createMicroKernel(Node, MicroKernelParams);
978   if (MacroKernelParams.Mc == 1 || MacroKernelParams.Nc == 1 ||
979       MacroKernelParams.Kc == 1)
980     return Node;
981   auto *MapOldIndVar = getInductionVariablesSubstitution(
982       Node, MicroKernelParams, MacroKernelParams);
983   if (!MapOldIndVar)
984     return Node;
985   return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams,
986                                           MacroKernelParams);
987 }
988 
989 bool ScheduleTreeOptimizer::isMatrMultPattern(
990     __isl_keep isl_schedule_node *Node) {
991   auto *PartialSchedule =
992       isl_schedule_node_band_get_partial_schedule_union_map(Node);
993   if (isl_schedule_node_band_n_member(Node) != 3 ||
994       isl_union_map_n_map(PartialSchedule) != 1) {
995     isl_union_map_free(PartialSchedule);
996     return false;
997   }
998   auto *NewPartialSchedule = isl_map_from_union_map(PartialSchedule);
999   NewPartialSchedule = circularShiftOutputDims(NewPartialSchedule);
1000   if (containsMatrMult(NewPartialSchedule)) {
1001     isl_map_free(NewPartialSchedule);
1002     return true;
1003   }
1004   isl_map_free(NewPartialSchedule);
1005   return false;
1006 }
1007 
1008 __isl_give isl_schedule_node *
1009 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node,
1010                                     void *User) {
1011   if (!isTileableBandNode(Node))
1012     return Node;
1013 
1014   if (PMBasedOpts && User && isMatrMultPattern(Node)) {
1015     DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
1016     const llvm::TargetTransformInfo *TTI;
1017     TTI = static_cast<const llvm::TargetTransformInfo *>(User);
1018     Node = optimizeMatMulPattern(Node, TTI);
1019   }
1020 
1021   return standardBandOpts(Node, User);
1022 }
1023 
1024 __isl_give isl_schedule *
1025 ScheduleTreeOptimizer::optimizeSchedule(__isl_take isl_schedule *Schedule,
1026                                         const llvm::TargetTransformInfo *TTI) {
1027   isl_schedule_node *Root = isl_schedule_get_root(Schedule);
1028   Root = optimizeScheduleNode(Root, TTI);
1029   isl_schedule_free(Schedule);
1030   auto S = isl_schedule_node_get_schedule(Root);
1031   isl_schedule_node_free(Root);
1032   return S;
1033 }
1034 
1035 __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeScheduleNode(
1036     __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
1037   Node = isl_schedule_node_map_descendant_bottom_up(
1038       Node, optimizeBand, const_cast<void *>(static_cast<const void *>(TTI)));
1039   return Node;
1040 }
1041 
1042 bool ScheduleTreeOptimizer::isProfitableSchedule(
1043     Scop &S, __isl_keep isl_schedule *NewSchedule) {
1044   // To understand if the schedule has been optimized we check if the schedule
1045   // has changed at all.
1046   // TODO: We can improve this by tracking if any necessarily beneficial
1047   // transformations have been performed. This can e.g. be tiling, loop
1048   // interchange, or ...) We can track this either at the place where the
1049   // transformation has been performed or, in case of automatic ILP based
1050   // optimizations, by comparing (yet to be defined) performance metrics
1051   // before/after the scheduling optimizer
1052   // (e.g., #stride-one accesses)
1053   if (S.containsExtensionNode(NewSchedule))
1054     return true;
1055   auto *NewScheduleMap = isl_schedule_get_map(NewSchedule);
1056   isl_union_map *OldSchedule = S.getSchedule();
1057   assert(OldSchedule && "Only IslScheduleOptimizer can insert extension nodes "
1058                         "that make Scop::getSchedule() return nullptr.");
1059   bool changed = !isl_union_map_is_equal(OldSchedule, NewScheduleMap);
1060   isl_union_map_free(OldSchedule);
1061   isl_union_map_free(NewScheduleMap);
1062   return changed;
1063 }
1064 
1065 namespace {
1066 class IslScheduleOptimizer : public ScopPass {
1067 public:
1068   static char ID;
1069   explicit IslScheduleOptimizer() : ScopPass(ID) { LastSchedule = nullptr; }
1070 
1071   ~IslScheduleOptimizer() { isl_schedule_free(LastSchedule); }
1072 
1073   /// Optimize the schedule of the SCoP @p S.
1074   bool runOnScop(Scop &S) override;
1075 
1076   /// Print the new schedule for the SCoP @p S.
1077   void printScop(raw_ostream &OS, Scop &S) const override;
1078 
1079   /// Register all analyses and transformation required.
1080   void getAnalysisUsage(AnalysisUsage &AU) const override;
1081 
1082   /// Release the internal memory.
1083   void releaseMemory() override {
1084     isl_schedule_free(LastSchedule);
1085     LastSchedule = nullptr;
1086   }
1087 
1088 private:
1089   isl_schedule *LastSchedule;
1090 };
1091 } // namespace
1092 
1093 char IslScheduleOptimizer::ID = 0;
1094 
1095 bool IslScheduleOptimizer::runOnScop(Scop &S) {
1096 
1097   // Skip empty SCoPs but still allow code generation as it will delete the
1098   // loops present but not needed.
1099   if (S.getSize() == 0) {
1100     S.markAsOptimized();
1101     return false;
1102   }
1103 
1104   const Dependences &D =
1105       getAnalysis<DependenceInfo>().getDependences(Dependences::AL_Statement);
1106 
1107   if (!D.hasValidDependences())
1108     return false;
1109 
1110   isl_schedule_free(LastSchedule);
1111   LastSchedule = nullptr;
1112 
1113   // Build input data.
1114   int ValidityKinds =
1115       Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
1116   int ProximityKinds;
1117 
1118   if (OptimizeDeps == "all")
1119     ProximityKinds =
1120         Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
1121   else if (OptimizeDeps == "raw")
1122     ProximityKinds = Dependences::TYPE_RAW;
1123   else {
1124     errs() << "Do not know how to optimize for '" << OptimizeDeps << "'"
1125            << " Falling back to optimizing all dependences.\n";
1126     ProximityKinds =
1127         Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
1128   }
1129 
1130   isl_union_set *Domain = S.getDomains();
1131 
1132   if (!Domain)
1133     return false;
1134 
1135   isl_union_map *Validity = D.getDependences(ValidityKinds);
1136   isl_union_map *Proximity = D.getDependences(ProximityKinds);
1137 
1138   // Simplify the dependences by removing the constraints introduced by the
1139   // domains. This can speed up the scheduling time significantly, as large
1140   // constant coefficients will be removed from the dependences. The
1141   // introduction of some additional dependences reduces the possible
1142   // transformations, but in most cases, such transformation do not seem to be
1143   // interesting anyway. In some cases this option may stop the scheduler to
1144   // find any schedule.
1145   if (SimplifyDeps == "yes") {
1146     Validity = isl_union_map_gist_domain(Validity, isl_union_set_copy(Domain));
1147     Validity = isl_union_map_gist_range(Validity, isl_union_set_copy(Domain));
1148     Proximity =
1149         isl_union_map_gist_domain(Proximity, isl_union_set_copy(Domain));
1150     Proximity = isl_union_map_gist_range(Proximity, isl_union_set_copy(Domain));
1151   } else if (SimplifyDeps != "no") {
1152     errs() << "warning: Option -polly-opt-simplify-deps should either be 'yes' "
1153               "or 'no'. Falling back to default: 'yes'\n";
1154   }
1155 
1156   DEBUG(dbgs() << "\n\nCompute schedule from: ");
1157   DEBUG(dbgs() << "Domain := " << stringFromIslObj(Domain) << ";\n");
1158   DEBUG(dbgs() << "Proximity := " << stringFromIslObj(Proximity) << ";\n");
1159   DEBUG(dbgs() << "Validity := " << stringFromIslObj(Validity) << ";\n");
1160 
1161   unsigned IslSerializeSCCs;
1162 
1163   if (FusionStrategy == "max") {
1164     IslSerializeSCCs = 0;
1165   } else if (FusionStrategy == "min") {
1166     IslSerializeSCCs = 1;
1167   } else {
1168     errs() << "warning: Unknown fusion strategy. Falling back to maximal "
1169               "fusion.\n";
1170     IslSerializeSCCs = 0;
1171   }
1172 
1173   int IslMaximizeBands;
1174 
1175   if (MaximizeBandDepth == "yes") {
1176     IslMaximizeBands = 1;
1177   } else if (MaximizeBandDepth == "no") {
1178     IslMaximizeBands = 0;
1179   } else {
1180     errs() << "warning: Option -polly-opt-maximize-bands should either be 'yes'"
1181               " or 'no'. Falling back to default: 'yes'\n";
1182     IslMaximizeBands = 1;
1183   }
1184 
1185   int IslOuterCoincidence;
1186 
1187   if (OuterCoincidence == "yes") {
1188     IslOuterCoincidence = 1;
1189   } else if (OuterCoincidence == "no") {
1190     IslOuterCoincidence = 0;
1191   } else {
1192     errs() << "warning: Option -polly-opt-outer-coincidence should either be "
1193               "'yes' or 'no'. Falling back to default: 'no'\n";
1194     IslOuterCoincidence = 0;
1195   }
1196 
1197   isl_ctx *Ctx = S.getIslCtx();
1198 
1199   isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence);
1200   isl_options_set_schedule_serialize_sccs(Ctx, IslSerializeSCCs);
1201   isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands);
1202   isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm);
1203   isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient);
1204   isl_options_set_tile_scale_tile_loops(Ctx, 0);
1205 
1206   auto OnErrorStatus = isl_options_get_on_error(Ctx);
1207   isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE);
1208 
1209   isl_schedule_constraints *ScheduleConstraints;
1210   ScheduleConstraints = isl_schedule_constraints_on_domain(Domain);
1211   ScheduleConstraints =
1212       isl_schedule_constraints_set_proximity(ScheduleConstraints, Proximity);
1213   ScheduleConstraints = isl_schedule_constraints_set_validity(
1214       ScheduleConstraints, isl_union_map_copy(Validity));
1215   ScheduleConstraints =
1216       isl_schedule_constraints_set_coincidence(ScheduleConstraints, Validity);
1217   isl_schedule *Schedule;
1218   Schedule = isl_schedule_constraints_compute_schedule(ScheduleConstraints);
1219   isl_options_set_on_error(Ctx, OnErrorStatus);
1220 
1221   // In cases the scheduler is not able to optimize the code, we just do not
1222   // touch the schedule.
1223   if (!Schedule)
1224     return false;
1225 
1226   DEBUG({
1227     auto *P = isl_printer_to_str(Ctx);
1228     P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
1229     P = isl_printer_print_schedule(P, Schedule);
1230     auto *str = isl_printer_get_str(P);
1231     dbgs() << "NewScheduleTree: \n" << str << "\n";
1232     free(str);
1233     isl_printer_free(P);
1234   });
1235 
1236   Function &F = S.getFunction();
1237   auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1238   isl_schedule *NewSchedule =
1239       ScheduleTreeOptimizer::optimizeSchedule(Schedule, TTI);
1240 
1241   if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewSchedule)) {
1242     isl_schedule_free(NewSchedule);
1243     return false;
1244   }
1245 
1246   S.setScheduleTree(NewSchedule);
1247   S.markAsOptimized();
1248 
1249   if (OptimizedScops)
1250     S.dump();
1251 
1252   return false;
1253 }
1254 
1255 void IslScheduleOptimizer::printScop(raw_ostream &OS, Scop &) const {
1256   isl_printer *p;
1257   char *ScheduleStr;
1258 
1259   OS << "Calculated schedule:\n";
1260 
1261   if (!LastSchedule) {
1262     OS << "n/a\n";
1263     return;
1264   }
1265 
1266   p = isl_printer_to_str(isl_schedule_get_ctx(LastSchedule));
1267   p = isl_printer_print_schedule(p, LastSchedule);
1268   ScheduleStr = isl_printer_get_str(p);
1269   isl_printer_free(p);
1270 
1271   OS << ScheduleStr << "\n";
1272 }
1273 
1274 void IslScheduleOptimizer::getAnalysisUsage(AnalysisUsage &AU) const {
1275   ScopPass::getAnalysisUsage(AU);
1276   AU.addRequired<DependenceInfo>();
1277   AU.addRequired<TargetTransformInfoWrapperPass>();
1278 }
1279 
1280 Pass *polly::createIslScheduleOptimizerPass() {
1281   return new IslScheduleOptimizer();
1282 }
1283 
1284 INITIALIZE_PASS_BEGIN(IslScheduleOptimizer, "polly-opt-isl",
1285                       "Polly - Optimize schedule of SCoP", false, false);
1286 INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
1287 INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass);
1288 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass);
1289 INITIALIZE_PASS_END(IslScheduleOptimizer, "polly-opt-isl",
1290                     "Polly - Optimize schedule of SCoP", false, false)
1291