1 //===- Schedule.cpp - Calculate an optimized schedule ---------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass generates an entirely new schedule tree from the data dependences
11 // and iteration domains. The new schedule tree is computed in two steps:
12 //
13 // 1) The isl scheduling optimizer is run
14 //
15 // The isl scheduling optimizer creates a new schedule tree that maximizes
16 // parallelism and tileability and minimizes data-dependence distances. The
17 // algorithm used is a modified version of the ``Pluto'' algorithm:
18 //
19 //   U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
20 //   A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
21 //   In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
22 //   Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008.
23 //
24 // 2) A set of post-scheduling transformations is applied on the schedule tree.
25 //
26 // These optimizations include:
27 //
28 //  - Tiling of the innermost tilable bands
29 //  - Prevectorization - The coice of a possible outer loop that is strip-mined
30 //                       to the innermost level to enable inner-loop
31 //                       vectorization.
32 //  - Some optimizations for spatial locality are also planned.
33 //
34 // For a detailed description of the schedule tree itself please see section 6
35 // of:
36 //
37 // Polyhedral AST generation is more than scanning polyhedra
38 // Tobias Grosser, Sven Verdoolaege, Albert Cohen
39 // ACM Transations on Programming Languages and Systems (TOPLAS),
40 // 37(4), July 2015
41 // http://www.grosser.es/#pub-polyhedral-AST-generation
42 //
43 // This publication also contains a detailed discussion of the different options
44 // for polyhedral loop unrolling, full/partial tile separation and other uses
45 // of the schedule tree.
46 //
47 //===----------------------------------------------------------------------===//
48 
49 #include "polly/ScheduleOptimizer.h"
50 #include "polly/CodeGen/CodeGeneration.h"
51 #include "polly/DependenceInfo.h"
52 #include "polly/LinkAllPasses.h"
53 #include "polly/Options.h"
54 #include "polly/ScopInfo.h"
55 #include "polly/Support/GICHelper.h"
56 #include "llvm/Analysis/TargetTransformInfo.h"
57 #include "llvm/Support/Debug.h"
58 #include "isl/aff.h"
59 #include "isl/band.h"
60 #include "isl/constraint.h"
61 #include "isl/map.h"
62 #include "isl/options.h"
63 #include "isl/printer.h"
64 #include "isl/schedule.h"
65 #include "isl/schedule_node.h"
66 #include "isl/space.h"
67 #include "isl/union_map.h"
68 #include "isl/union_set.h"
69 
70 using namespace llvm;
71 using namespace polly;
72 
73 #define DEBUG_TYPE "polly-opt-isl"
74 
75 static cl::opt<std::string>
76     OptimizeDeps("polly-opt-optimize-only",
77                  cl::desc("Only a certain kind of dependences (all/raw)"),
78                  cl::Hidden, cl::init("all"), cl::ZeroOrMore,
79                  cl::cat(PollyCategory));
80 
81 static cl::opt<std::string>
82     SimplifyDeps("polly-opt-simplify-deps",
83                  cl::desc("Dependences should be simplified (yes/no)"),
84                  cl::Hidden, cl::init("yes"), cl::ZeroOrMore,
85                  cl::cat(PollyCategory));
86 
87 static cl::opt<int> MaxConstantTerm(
88     "polly-opt-max-constant-term",
89     cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden,
90     cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));
91 
92 static cl::opt<int> MaxCoefficient(
93     "polly-opt-max-coefficient",
94     cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden,
95     cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));
96 
97 static cl::opt<std::string> FusionStrategy(
98     "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"),
99     cl::Hidden, cl::init("min"), cl::ZeroOrMore, cl::cat(PollyCategory));
100 
101 static cl::opt<std::string>
102     MaximizeBandDepth("polly-opt-maximize-bands",
103                       cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
104                       cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory));
105 
106 static cl::opt<std::string> OuterCoincidence(
107     "polly-opt-outer-coincidence",
108     cl::desc("Try to construct schedules where the outer member of each band "
109              "satisfies the coincidence constraints (yes/no)"),
110     cl::Hidden, cl::init("no"), cl::ZeroOrMore, cl::cat(PollyCategory));
111 
112 static cl::opt<int> PrevectorWidth(
113     "polly-prevect-width",
114     cl::desc(
115         "The number of loop iterations to strip-mine for pre-vectorization"),
116     cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory));
117 
118 static cl::opt<bool> FirstLevelTiling("polly-tiling",
119                                       cl::desc("Enable loop tiling"),
120                                       cl::init(true), cl::ZeroOrMore,
121                                       cl::cat(PollyCategory));
122 
123 static cl::opt<int> LatencyVectorFma(
124     "polly-target-latency-vector-fma",
125     cl::desc("The minimal number of cycles between issuing two "
126              "dependent consecutive vector fused multiply-add "
127              "instructions."),
128     cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
129 
130 static cl::opt<int> ThroughputVectorFma(
131     "polly-target-throughput-vector-fma",
132     cl::desc("A throughput of the processor floating-point arithmetic units "
133              "expressed in the number of vector fused multiply-add "
134              "instructions per clock cycle."),
135     cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory));
136 
137 // This option, along with --polly-target-2nd-cache-level-associativity,
138 // --polly-target-1st-cache-level-size, and --polly-target-2st-cache-level-size
139 // represent the parameters of the target cache, which do not have typical
140 // values that can be used by default. However, to apply the pattern matching
141 // optimizations, we use the values of the parameters of Intel Core i7-3820
142 // SandyBridge in case the parameters are not specified. Such an approach helps
143 // also to attain the high-performance on IBM POWER System S822 and IBM Power
144 // 730 Express server.
145 static cl::opt<int> FirstCacheLevelAssociativity(
146     "polly-target-1st-cache-level-associativity",
147     cl::desc("The associativity of the first cache level."), cl::Hidden,
148     cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
149 
150 static cl::opt<int> SecondCacheLevelAssociativity(
151     "polly-target-2nd-cache-level-associativity",
152     cl::desc("The associativity of the second cache level."), cl::Hidden,
153     cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
154 
155 static cl::opt<int> FirstCacheLevelSize(
156     "polly-target-1st-cache-level-size",
157     cl::desc("The size of the first cache level specified in bytes."),
158     cl::Hidden, cl::init(32768), cl::ZeroOrMore, cl::cat(PollyCategory));
159 
160 static cl::opt<int> SecondCacheLevelSize(
161     "polly-target-2nd-cache-level-size",
162     cl::desc("The size of the second level specified in bytes."), cl::Hidden,
163     cl::init(262144), cl::ZeroOrMore, cl::cat(PollyCategory));
164 
165 static cl::opt<int> VectorRegisterBitwidth(
166     "polly-target-vector-register-bitwidth",
167     cl::desc("The size in bits of a vector register (if not set, this "
168              "information is taken from LLVM's target information."),
169     cl::Hidden, cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
170 
171 static cl::opt<int> FirstLevelDefaultTileSize(
172     "polly-default-tile-size",
173     cl::desc("The default tile size (if not enough were provided by"
174              " --polly-tile-sizes)"),
175     cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory));
176 
177 static cl::list<int>
178     FirstLevelTileSizes("polly-tile-sizes",
179                         cl::desc("A tile size for each loop dimension, filled "
180                                  "with --polly-default-tile-size"),
181                         cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
182                         cl::cat(PollyCategory));
183 
184 static cl::opt<bool>
185     SecondLevelTiling("polly-2nd-level-tiling",
186                       cl::desc("Enable a 2nd level loop of loop tiling"),
187                       cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
188 
189 static cl::opt<int> SecondLevelDefaultTileSize(
190     "polly-2nd-level-default-tile-size",
191     cl::desc("The default 2nd-level tile size (if not enough were provided by"
192              " --polly-2nd-level-tile-sizes)"),
193     cl::Hidden, cl::init(16), cl::ZeroOrMore, cl::cat(PollyCategory));
194 
195 static cl::list<int>
196     SecondLevelTileSizes("polly-2nd-level-tile-sizes",
197                          cl::desc("A tile size for each loop dimension, filled "
198                                   "with --polly-default-tile-size"),
199                          cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
200                          cl::cat(PollyCategory));
201 
202 static cl::opt<bool> RegisterTiling("polly-register-tiling",
203                                     cl::desc("Enable register tiling"),
204                                     cl::init(false), cl::ZeroOrMore,
205                                     cl::cat(PollyCategory));
206 
207 static cl::opt<int> RegisterDefaultTileSize(
208     "polly-register-tiling-default-tile-size",
209     cl::desc("The default register tile size (if not enough were provided by"
210              " --polly-register-tile-sizes)"),
211     cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory));
212 
213 static cl::opt<int> PollyPatternMatchingNcQuotient(
214     "polly-pattern-matching-nc-quotient",
215     cl::desc("Quotient that is obtained by dividing Nc, the parameter of the"
216              "macro-kernel, by Nr, the parameter of the micro-kernel"),
217     cl::Hidden, cl::init(256), cl::ZeroOrMore, cl::cat(PollyCategory));
218 
219 static cl::list<int>
220     RegisterTileSizes("polly-register-tile-sizes",
221                       cl::desc("A tile size for each loop dimension, filled "
222                                "with --polly-register-tile-size"),
223                       cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
224                       cl::cat(PollyCategory));
225 
226 static cl::opt<bool>
227     PMBasedOpts("polly-pattern-matching-based-opts",
228                 cl::desc("Perform optimizations based on pattern matching"),
229                 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
230 
231 static cl::opt<bool> OptimizedScops(
232     "polly-optimized-scops",
233     cl::desc("Polly - Dump polyhedral description of Scops optimized with "
234              "the isl scheduling optimizer and the set of post-scheduling "
235              "transformations is applied on the schedule tree"),
236     cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
237 
238 /// Create an isl_union_set, which describes the isolate option based on
239 /// IsoalteDomain.
240 ///
241 /// @param IsolateDomain An isl_set whose last dimension is the only one that
242 ///                      should belong to the current band node.
243 static __isl_give isl_union_set *
244 getIsolateOptions(__isl_take isl_set *IsolateDomain) {
245   auto Dims = isl_set_dim(IsolateDomain, isl_dim_set);
246   auto *IsolateRelation = isl_map_from_domain(IsolateDomain);
247   IsolateRelation = isl_map_move_dims(IsolateRelation, isl_dim_out, 0,
248                                       isl_dim_in, Dims - 1, 1);
249   auto *IsolateOption = isl_map_wrap(IsolateRelation);
250   auto *Id = isl_id_alloc(isl_set_get_ctx(IsolateOption), "isolate", nullptr);
251   return isl_union_set_from_set(isl_set_set_tuple_id(IsolateOption, Id));
252 }
253 
254 /// Create an isl_union_set, which describes the atomic option for the dimension
255 /// of the current node.
256 ///
257 /// It may help to reduce the size of generated code.
258 ///
259 /// @param Ctx An isl_ctx, which is used to create the isl_union_set.
260 static __isl_give isl_union_set *getAtomicOptions(__isl_take isl_ctx *Ctx) {
261   auto *Space = isl_space_set_alloc(Ctx, 0, 1);
262   auto *AtomicOption = isl_set_universe(Space);
263   auto *Id = isl_id_alloc(Ctx, "atomic", nullptr);
264   return isl_union_set_from_set(isl_set_set_tuple_id(AtomicOption, Id));
265 }
266 
267 /// Make the last dimension of Set to take values from 0 to VectorWidth - 1.
268 ///
269 /// @param Set         A set, which should be modified.
270 /// @param VectorWidth A parameter, which determines the constraint.
271 static __isl_give isl_set *addExtentConstraints(__isl_take isl_set *Set,
272                                                 int VectorWidth) {
273   auto Dims = isl_set_dim(Set, isl_dim_set);
274   auto Space = isl_set_get_space(Set);
275   auto *LocalSpace = isl_local_space_from_space(Space);
276   auto *ExtConstr =
277       isl_constraint_alloc_inequality(isl_local_space_copy(LocalSpace));
278   ExtConstr = isl_constraint_set_constant_si(ExtConstr, 0);
279   ExtConstr =
280       isl_constraint_set_coefficient_si(ExtConstr, isl_dim_set, Dims - 1, 1);
281   Set = isl_set_add_constraint(Set, ExtConstr);
282   ExtConstr = isl_constraint_alloc_inequality(LocalSpace);
283   ExtConstr = isl_constraint_set_constant_si(ExtConstr, VectorWidth - 1);
284   ExtConstr =
285       isl_constraint_set_coefficient_si(ExtConstr, isl_dim_set, Dims - 1, -1);
286   return isl_set_add_constraint(Set, ExtConstr);
287 }
288 
289 /// Build the desired set of partial tile prefixes.
290 ///
291 /// We build a set of partial tile prefixes, which are prefixes of the vector
292 /// loop that have exactly VectorWidth iterations.
293 ///
294 /// 1. Get all prefixes of the vector loop.
295 /// 2. Extend it to a set, which has exactly VectorWidth iterations for
296 ///    any prefix from the set that was built on the previous step.
297 /// 3. Subtract loop domain from it, project out the vector loop dimension and
298 ///    get a set of prefixes, which don't have exactly VectorWidth iterations.
299 /// 4. Subtract it from all prefixes of the vector loop and get the desired
300 ///    set.
301 ///
302 /// @param ScheduleRange A range of a map, which describes a prefix schedule
303 ///                      relation.
304 static __isl_give isl_set *
305 getPartialTilePrefixes(__isl_take isl_set *ScheduleRange, int VectorWidth) {
306   auto Dims = isl_set_dim(ScheduleRange, isl_dim_set);
307   auto *LoopPrefixes = isl_set_project_out(isl_set_copy(ScheduleRange),
308                                            isl_dim_set, Dims - 1, 1);
309   auto *ExtentPrefixes =
310       isl_set_add_dims(isl_set_copy(LoopPrefixes), isl_dim_set, 1);
311   ExtentPrefixes = addExtentConstraints(ExtentPrefixes, VectorWidth);
312   auto *BadPrefixes = isl_set_subtract(ExtentPrefixes, ScheduleRange);
313   BadPrefixes = isl_set_project_out(BadPrefixes, isl_dim_set, Dims - 1, 1);
314   return isl_set_subtract(LoopPrefixes, BadPrefixes);
315 }
316 
317 __isl_give isl_schedule_node *ScheduleTreeOptimizer::isolateFullPartialTiles(
318     __isl_take isl_schedule_node *Node, int VectorWidth) {
319   assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
320   Node = isl_schedule_node_child(Node, 0);
321   Node = isl_schedule_node_child(Node, 0);
322   auto *SchedRelUMap = isl_schedule_node_get_prefix_schedule_relation(Node);
323   auto *ScheduleRelation = isl_map_from_union_map(SchedRelUMap);
324   auto *ScheduleRange = isl_map_range(ScheduleRelation);
325   auto *IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth);
326   auto *AtomicOption = getAtomicOptions(isl_set_get_ctx(IsolateDomain));
327   auto *IsolateOption = getIsolateOptions(IsolateDomain);
328   Node = isl_schedule_node_parent(Node);
329   Node = isl_schedule_node_parent(Node);
330   auto *Options = isl_union_set_union(IsolateOption, AtomicOption);
331   Node = isl_schedule_node_band_set_ast_build_options(Node, Options);
332   return Node;
333 }
334 
335 __isl_give isl_schedule_node *
336 ScheduleTreeOptimizer::prevectSchedBand(__isl_take isl_schedule_node *Node,
337                                         unsigned DimToVectorize,
338                                         int VectorWidth) {
339   assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
340 
341   auto Space = isl_schedule_node_band_get_space(Node);
342   auto ScheduleDimensions = isl_space_dim(Space, isl_dim_set);
343   isl_space_free(Space);
344   assert(DimToVectorize < ScheduleDimensions);
345 
346   if (DimToVectorize > 0) {
347     Node = isl_schedule_node_band_split(Node, DimToVectorize);
348     Node = isl_schedule_node_child(Node, 0);
349   }
350   if (DimToVectorize < ScheduleDimensions - 1)
351     Node = isl_schedule_node_band_split(Node, 1);
352   Space = isl_schedule_node_band_get_space(Node);
353   auto Sizes = isl_multi_val_zero(Space);
354   auto Ctx = isl_schedule_node_get_ctx(Node);
355   Sizes =
356       isl_multi_val_set_val(Sizes, 0, isl_val_int_from_si(Ctx, VectorWidth));
357   Node = isl_schedule_node_band_tile(Node, Sizes);
358   Node = isolateFullPartialTiles(Node, VectorWidth);
359   Node = isl_schedule_node_child(Node, 0);
360   // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise,
361   // we will have troubles to match it in the backend.
362   Node = isl_schedule_node_band_set_ast_build_options(
363       Node, isl_union_set_read_from_str(Ctx, "{ unroll[x]: 1 = 0 }"));
364   Node = isl_schedule_node_band_sink(Node);
365   Node = isl_schedule_node_child(Node, 0);
366   if (isl_schedule_node_get_type(Node) == isl_schedule_node_leaf)
367     Node = isl_schedule_node_parent(Node);
368   isl_id *LoopMarker = isl_id_alloc(Ctx, "SIMD", nullptr);
369   Node = isl_schedule_node_insert_mark(Node, LoopMarker);
370   return Node;
371 }
372 
373 __isl_give isl_schedule_node *
374 ScheduleTreeOptimizer::tileNode(__isl_take isl_schedule_node *Node,
375                                 const char *Identifier, ArrayRef<int> TileSizes,
376                                 int DefaultTileSize) {
377   auto Ctx = isl_schedule_node_get_ctx(Node);
378   auto Space = isl_schedule_node_band_get_space(Node);
379   auto Dims = isl_space_dim(Space, isl_dim_set);
380   auto Sizes = isl_multi_val_zero(Space);
381   std::string IdentifierString(Identifier);
382   for (unsigned i = 0; i < Dims; i++) {
383     auto tileSize = i < TileSizes.size() ? TileSizes[i] : DefaultTileSize;
384     Sizes = isl_multi_val_set_val(Sizes, i, isl_val_int_from_si(Ctx, tileSize));
385   }
386   auto TileLoopMarkerStr = IdentifierString + " - Tiles";
387   isl_id *TileLoopMarker =
388       isl_id_alloc(Ctx, TileLoopMarkerStr.c_str(), nullptr);
389   Node = isl_schedule_node_insert_mark(Node, TileLoopMarker);
390   Node = isl_schedule_node_child(Node, 0);
391   Node = isl_schedule_node_band_tile(Node, Sizes);
392   Node = isl_schedule_node_child(Node, 0);
393   auto PointLoopMarkerStr = IdentifierString + " - Points";
394   isl_id *PointLoopMarker =
395       isl_id_alloc(Ctx, PointLoopMarkerStr.c_str(), nullptr);
396   Node = isl_schedule_node_insert_mark(Node, PointLoopMarker);
397   Node = isl_schedule_node_child(Node, 0);
398   return Node;
399 }
400 
401 __isl_give isl_schedule_node *
402 ScheduleTreeOptimizer::applyRegisterTiling(__isl_take isl_schedule_node *Node,
403                                            llvm::ArrayRef<int> TileSizes,
404                                            int DefaultTileSize) {
405   auto *Ctx = isl_schedule_node_get_ctx(Node);
406   Node = tileNode(Node, "Register tiling", TileSizes, DefaultTileSize);
407   Node = isl_schedule_node_band_set_ast_build_options(
408       Node, isl_union_set_read_from_str(Ctx, "{unroll[x]}"));
409   return Node;
410 }
411 
412 bool ScheduleTreeOptimizer::isTileableBandNode(
413     __isl_keep isl_schedule_node *Node) {
414   if (isl_schedule_node_get_type(Node) != isl_schedule_node_band)
415     return false;
416 
417   if (isl_schedule_node_n_children(Node) != 1)
418     return false;
419 
420   if (!isl_schedule_node_band_get_permutable(Node))
421     return false;
422 
423   auto Space = isl_schedule_node_band_get_space(Node);
424   auto Dims = isl_space_dim(Space, isl_dim_set);
425   isl_space_free(Space);
426 
427   if (Dims <= 1)
428     return false;
429 
430   auto Child = isl_schedule_node_get_child(Node, 0);
431   auto Type = isl_schedule_node_get_type(Child);
432   isl_schedule_node_free(Child);
433 
434   if (Type != isl_schedule_node_leaf)
435     return false;
436 
437   return true;
438 }
439 
440 __isl_give isl_schedule_node *
441 ScheduleTreeOptimizer::standardBandOpts(__isl_take isl_schedule_node *Node,
442                                         void *User) {
443   if (FirstLevelTiling)
444     Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes,
445                     FirstLevelDefaultTileSize);
446 
447   if (SecondLevelTiling)
448     Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes,
449                     SecondLevelDefaultTileSize);
450 
451   if (RegisterTiling)
452     Node =
453         applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize);
454 
455   if (PollyVectorizerChoice == VECTORIZER_NONE)
456     return Node;
457 
458   auto Space = isl_schedule_node_band_get_space(Node);
459   auto Dims = isl_space_dim(Space, isl_dim_set);
460   isl_space_free(Space);
461 
462   for (int i = Dims - 1; i >= 0; i--)
463     if (isl_schedule_node_band_member_get_coincident(Node, i)) {
464       Node = prevectSchedBand(Node, i, PrevectorWidth);
465       break;
466     }
467 
468   return Node;
469 }
470 
471 /// Check whether output dimensions of the map rely on the specified input
472 /// dimension.
473 ///
474 /// @param IslMap The isl map to be considered.
475 /// @param DimNum The number of an input dimension to be checked.
476 static bool isInputDimUsed(__isl_take isl_map *IslMap, unsigned DimNum) {
477   auto *CheckedAccessRelation =
478       isl_map_project_out(isl_map_copy(IslMap), isl_dim_in, DimNum, 1);
479   CheckedAccessRelation =
480       isl_map_insert_dims(CheckedAccessRelation, isl_dim_in, DimNum, 1);
481   auto *InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in);
482   CheckedAccessRelation =
483       isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_in, InputDimsId);
484   InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_out);
485   CheckedAccessRelation =
486       isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_out, InputDimsId);
487   auto res = !isl_map_is_equal(CheckedAccessRelation, IslMap);
488   isl_map_free(CheckedAccessRelation);
489   isl_map_free(IslMap);
490   return res;
491 }
492 
493 /// Check if the SCoP statement could probably be optimized with analytical
494 /// modeling.
495 ///
496 /// containsMatrMult tries to determine whether the following conditions
497 /// are true:
498 /// 1. all memory accesses of the statement will have stride 0 or 1,
499 ///    if we interchange loops (switch the variable used in the inner
500 ///    loop to the outer loop).
501 /// 2. all memory accesses of the statement except from the last one, are
502 ///    read memory access and the last one is write memory access.
503 /// 3. all subscripts of the last memory access of the statement don't contain
504 ///    the variable used in the inner loop.
505 ///
506 /// @param PartialSchedule The PartialSchedule that contains a SCoP statement
507 ///        to check.
508 static bool containsMatrMult(__isl_keep isl_map *PartialSchedule) {
509   auto InputDimsId = isl_map_get_tuple_id(PartialSchedule, isl_dim_in);
510   auto *ScpStmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
511   isl_id_free(InputDimsId);
512   if (ScpStmt->size() <= 1)
513     return false;
514   auto MemA = ScpStmt->begin();
515   for (unsigned i = 0; i < ScpStmt->size() - 2 && MemA != ScpStmt->end();
516        i++, MemA++)
517     if (!(*MemA)->isRead() ||
518         ((*MemA)->isArrayKind() &&
519          !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) ||
520            (*MemA)->isStrideZero(isl_map_copy(PartialSchedule)))))
521       return false;
522   MemA++;
523   if (!(*MemA)->isWrite() || !(*MemA)->isArrayKind() ||
524       !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) ||
525         (*MemA)->isStrideZero(isl_map_copy(PartialSchedule))))
526     return false;
527   auto DimNum = isl_map_dim(PartialSchedule, isl_dim_in);
528   return !isInputDimUsed((*MemA)->getAccessRelation(), DimNum - 1);
529 }
530 
531 /// Circular shift of output dimensions of the integer map.
532 ///
533 /// @param IslMap The isl map to be modified.
534 static __isl_give isl_map *circularShiftOutputDims(__isl_take isl_map *IslMap) {
535   auto DimNum = isl_map_dim(IslMap, isl_dim_out);
536   if (DimNum == 0)
537     return IslMap;
538   auto InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in);
539   IslMap = isl_map_move_dims(IslMap, isl_dim_in, 0, isl_dim_out, DimNum - 1, 1);
540   IslMap = isl_map_move_dims(IslMap, isl_dim_out, 0, isl_dim_in, 0, 1);
541   return isl_map_set_tuple_id(IslMap, isl_dim_in, InputDimsId);
542 }
543 
544 /// Permute two dimensions of the band node.
545 ///
546 /// Permute FirstDim and SecondDim dimensions of the Node.
547 ///
548 /// @param Node The band node to be modified.
549 /// @param FirstDim The first dimension to be permuted.
550 /// @param SecondDim The second dimension to be permuted.
551 static __isl_give isl_schedule_node *
552 permuteBandNodeDimensions(__isl_take isl_schedule_node *Node, unsigned FirstDim,
553                           unsigned SecondDim) {
554   assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band &&
555          isl_schedule_node_band_n_member(Node) > std::max(FirstDim, SecondDim));
556   auto PartialSchedule = isl_schedule_node_band_get_partial_schedule(Node);
557   auto PartialScheduleFirstDim =
558       isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, FirstDim);
559   auto PartialScheduleSecondDim =
560       isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, SecondDim);
561   PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
562       PartialSchedule, SecondDim, PartialScheduleFirstDim);
563   PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
564       PartialSchedule, FirstDim, PartialScheduleSecondDim);
565   Node = isl_schedule_node_delete(Node);
566   Node = isl_schedule_node_insert_partial_schedule(Node, PartialSchedule);
567   return Node;
568 }
569 
570 __isl_give isl_schedule_node *ScheduleTreeOptimizer::createMicroKernel(
571     __isl_take isl_schedule_node *Node, MicroKernelParamsTy MicroKernelParams) {
572   applyRegisterTiling(Node, {MicroKernelParams.Mr, MicroKernelParams.Nr}, 1);
573   Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
574   Node = permuteBandNodeDimensions(Node, 0, 1);
575   return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
576 }
577 
578 __isl_give isl_schedule_node *ScheduleTreeOptimizer::createMacroKernel(
579     __isl_take isl_schedule_node *Node, MacroKernelParamsTy MacroKernelParams) {
580   assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
581   if (MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 1 &&
582       MacroKernelParams.Kc == 1)
583     return Node;
584   Node = tileNode(
585       Node, "1st level tiling",
586       {MacroKernelParams.Mc, MacroKernelParams.Nc, MacroKernelParams.Kc}, 1);
587   Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
588   Node = permuteBandNodeDimensions(Node, 1, 2);
589   Node = permuteBandNodeDimensions(Node, 0, 2);
590   return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
591 }
592 
593 /// Get parameters of the BLIS micro kernel.
594 ///
595 /// We choose the Mr and Nr parameters of the micro kernel to be large enough
596 /// such that no stalls caused by the combination of latencies and dependencies
597 /// are introduced during the updates of the resulting matrix of the matrix
598 /// multiplication. However, they should also be as small as possible to
599 /// release more registers for entries of multiplied matrices.
600 ///
601 /// @param TTI Target Transform Info.
602 /// @return The structure of type MicroKernelParamsTy.
603 /// @see MicroKernelParamsTy
604 static struct MicroKernelParamsTy
605 getMicroKernelParams(const llvm::TargetTransformInfo *TTI) {
606   assert(TTI && "The target transform info should be provided.");
607 
608   // Nvec - Number of double-precision floating-point numbers that can be hold
609   // by a vector register. Use 2 by default.
610   long RegisterBitwidth = VectorRegisterBitwidth;
611 
612   if (RegisterBitwidth == -1)
613     RegisterBitwidth = TTI->getRegisterBitWidth(true);
614   auto Nvec = RegisterBitwidth / 64;
615   if (Nvec == 0)
616     Nvec = 2;
617   int Nr =
618       ceil(sqrt(Nvec * LatencyVectorFma * ThroughputVectorFma) / Nvec) * Nvec;
619   int Mr = ceil(Nvec * LatencyVectorFma * ThroughputVectorFma / Nr);
620   return {Mr, Nr};
621 }
622 
623 /// Get parameters of the BLIS macro kernel.
624 ///
625 /// During the computation of matrix multiplication, blocks of partitioned
626 /// matrices are mapped to different layers of the memory hierarchy.
627 /// To optimize data reuse, blocks should be ideally kept in cache between
628 /// iterations. Since parameters of the macro kernel determine sizes of these
629 /// blocks, there are upper and lower bounds on these parameters.
630 ///
631 /// @param MicroKernelParams Parameters of the micro-kernel
632 ///                          to be taken into account.
633 /// @return The structure of type MacroKernelParamsTy.
634 /// @see MacroKernelParamsTy
635 /// @see MicroKernelParamsTy
636 static struct MacroKernelParamsTy
637 getMacroKernelParams(const MicroKernelParamsTy &MicroKernelParams) {
638   // According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf,
639   // it requires information about the first two levels of a cache to determine
640   // all the parameters of a macro-kernel. It also checks that an associativity
641   // degree of a cache level is greater than two. Otherwise, another algorithm
642   // for determination of the parameters should be used.
643   if (!(MicroKernelParams.Mr > 0 && MicroKernelParams.Nr > 0 &&
644         FirstCacheLevelSize > 0 && SecondCacheLevelSize > 0 &&
645         FirstCacheLevelAssociativity > 2 && SecondCacheLevelAssociativity > 2))
646     return {1, 1, 1};
647   // The quotient should be greater than zero.
648   if (PollyPatternMatchingNcQuotient <= 0)
649     return {1, 1, 1};
650   int Car = floor(
651       (FirstCacheLevelAssociativity - 1) /
652       (1 + static_cast<double>(MicroKernelParams.Nr) / MicroKernelParams.Mr));
653   int Kc = (Car * FirstCacheLevelSize) /
654            (MicroKernelParams.Mr * FirstCacheLevelAssociativity * 8);
655   double Cac = static_cast<double>(Kc * 8 * SecondCacheLevelAssociativity) /
656                SecondCacheLevelSize;
657   int Mc = floor((SecondCacheLevelAssociativity - 2) / Cac);
658   int Nc = PollyPatternMatchingNcQuotient * MicroKernelParams.Nr;
659   return {Mc, Nc, Kc};
660 }
661 
662 /// Identify a memory access through the shape of its memory access relation.
663 ///
664 /// Identify the unique memory access in @p Stmt, that has an access relation
665 /// equal to @p ExpectedAccessRelation.
666 ///
667 /// @param Stmt The SCoP statement that contains the memory accesses under
668 ///             consideration.
669 /// @param ExpectedAccessRelation The access relation that identifies
670 ///                               the memory access.
671 /// @return  The memory access of @p Stmt whose memory access relation is equal
672 ///          to @p ExpectedAccessRelation. nullptr in case there is no or more
673 ///          than one such access.
674 MemoryAccess *
675 identifyAccessByAccessRelation(ScopStmt *Stmt,
676                                __isl_take isl_map *ExpectedAccessRelation) {
677   if (isl_map_has_tuple_id(ExpectedAccessRelation, isl_dim_out))
678     ExpectedAccessRelation =
679         isl_map_reset_tuple_id(ExpectedAccessRelation, isl_dim_out);
680   MemoryAccess *IdentifiedAccess = nullptr;
681   for (auto *Access : *Stmt) {
682     auto *AccessRelation = Access->getAccessRelation();
683     AccessRelation = isl_map_reset_tuple_id(AccessRelation, isl_dim_out);
684     if (isl_map_is_equal(ExpectedAccessRelation, AccessRelation)) {
685       if (IdentifiedAccess) {
686         isl_map_free(AccessRelation);
687         isl_map_free(ExpectedAccessRelation);
688         return nullptr;
689       }
690       IdentifiedAccess = Access;
691     }
692     isl_map_free(AccessRelation);
693   }
694   isl_map_free(ExpectedAccessRelation);
695   return IdentifiedAccess;
696 }
697 
698 /// Add constrains to @Dim dimension of @p ExtMap.
699 ///
700 /// If @ExtMap has the following form [O0, O1, O2]->[I1, I2, I3],
701 /// the following constraint will be added
702 /// Bound * OM <= IM <= Bound * (OM + 1) - 1,
703 /// where M is @p Dim and Bound is @p Bound.
704 ///
705 /// @param ExtMap The isl map to be modified.
706 /// @param Dim The output dimension to be modfied.
707 /// @param Bound The value that is used to specify the constraint.
708 /// @return The modified isl map
709 __isl_give isl_map *
710 addExtensionMapMatMulDimConstraint(__isl_take isl_map *ExtMap, unsigned Dim,
711                                    unsigned Bound) {
712   assert(Bound != 0);
713   auto *ExtMapSpace = isl_map_get_space(ExtMap);
714   auto *ConstrSpace = isl_local_space_from_space(ExtMapSpace);
715   auto *Constr =
716       isl_constraint_alloc_inequality(isl_local_space_copy(ConstrSpace));
717   Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_out, Dim, 1);
718   Constr =
719       isl_constraint_set_coefficient_si(Constr, isl_dim_in, Dim, Bound * (-1));
720   ExtMap = isl_map_add_constraint(ExtMap, Constr);
721   Constr = isl_constraint_alloc_inequality(ConstrSpace);
722   Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_out, Dim, -1);
723   Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_in, Dim, Bound);
724   Constr = isl_constraint_set_constant_si(Constr, Bound - 1);
725   return isl_map_add_constraint(ExtMap, Constr);
726 }
727 
728 /// Create an access relation that is specific for matrix multiplication
729 /// pattern.
730 ///
731 /// Create an access relation of the following form:
732 /// { [O0, O1, O2]->[I1, I2, I3] :
733 ///   FirstOutputDimBound * O0 <= I1 <= FirstOutputDimBound * (O0 + 1) - 1
734 ///   and SecondOutputDimBound * O1 <= I2 <= SecondOutputDimBound * (O1 + 1) - 1
735 ///   and ThirdOutputDimBound * O2 <= I3 <= ThirdOutputDimBound * (O2 + 1) - 1}
736 ///   where FirstOutputDimBound is @p FirstOutputDimBound,
737 ///   SecondOutputDimBound is @p SecondOutputDimBound,
738 ///   ThirdOutputDimBound is @p ThirdOutputDimBound
739 ///
740 /// @param Ctx The isl context.
741 /// @param FirstOutputDimBound,
742 ///        SecondOutputDimBound,
743 ///        ThirdOutputDimBound The parameters of the access relation.
744 /// @return The specified access relation.
745 __isl_give isl_map *getMatMulExt(isl_ctx *Ctx, unsigned FirstOutputDimBound,
746                                  unsigned SecondOutputDimBound,
747                                  unsigned ThirdOutputDimBound) {
748   auto *NewRelSpace = isl_space_alloc(Ctx, 0, 3, 3);
749   auto *extensionMap = isl_map_universe(NewRelSpace);
750   if (!FirstOutputDimBound)
751     extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 0, 0);
752   else
753     extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 0,
754                                                       FirstOutputDimBound);
755   if (!SecondOutputDimBound)
756     extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 1, 0);
757   else
758     extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 1,
759                                                       SecondOutputDimBound);
760   if (!ThirdOutputDimBound)
761     extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 2, 0);
762   else
763     extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 2,
764                                                       ThirdOutputDimBound);
765   return extensionMap;
766 }
767 
768 /// Create an access relation that is specific to the matrix
769 ///        multiplication pattern.
770 ///
771 /// Create an access relation of the following form:
772 /// Stmt[O0, O1, O2]->[OI, OJ],
773 /// where I is @p I, J is @J
774 ///
775 /// @param Stmt The SCoP statement for which to generate the access relation.
776 /// @param I The index of the input dimension that is mapped to the first output
777 ///          dimension.
778 /// @param J The index of the input dimension that is mapped to the second
779 ///          output dimension.
780 /// @return The specified access relation.
781 __isl_give isl_map *
782 getMatMulPatternOriginalAccessRelation(ScopStmt *Stmt, unsigned I, unsigned J) {
783   auto *AccessRelSpace = isl_space_alloc(Stmt->getIslCtx(), 0, 3, 2);
784   auto *AccessRel = isl_map_universe(AccessRelSpace);
785   AccessRel = isl_map_equate(AccessRel, isl_dim_in, I, isl_dim_out, 0);
786   AccessRel = isl_map_equate(AccessRel, isl_dim_in, J, isl_dim_out, 1);
787   AccessRel = isl_map_set_tuple_id(AccessRel, isl_dim_in, Stmt->getDomainId());
788   return AccessRel;
789 }
790 
791 /// Identify the memory access that corresponds to the access to the second
792 /// operand of the matrix multiplication.
793 ///
794 /// Identify the memory access that corresponds to the access
795 /// to the matrix B of the matrix multiplication C = A x B.
796 ///
797 /// @param Stmt The SCoP statement that contains the memory accesses
798 ///             under consideration.
799 /// @return The memory access of @p Stmt that corresponds to the access
800 ///         to the second operand of the matrix multiplication.
801 MemoryAccess *identifyAccessA(ScopStmt *Stmt) {
802   auto *OriginalRel = getMatMulPatternOriginalAccessRelation(Stmt, 0, 2);
803   return identifyAccessByAccessRelation(Stmt, OriginalRel);
804 }
805 
806 /// Identify the memory access that corresponds to the access to the first
807 /// operand of the matrix multiplication.
808 ///
809 /// Identify the memory access that corresponds to the access
810 /// to the matrix A of the matrix multiplication C = A x B.
811 ///
812 /// @param Stmt The SCoP statement that contains the memory accesses
813 ///             under consideration.
814 /// @return The memory access of @p Stmt that corresponds to the access
815 ///         to the first operand of the matrix multiplication.
816 MemoryAccess *identifyAccessB(ScopStmt *Stmt) {
817   auto *OriginalRel = getMatMulPatternOriginalAccessRelation(Stmt, 2, 1);
818   return identifyAccessByAccessRelation(Stmt, OriginalRel);
819 }
820 
821 /// Create an access relation that is specific to
822 ///        the matrix multiplication pattern.
823 ///
824 /// Create an access relation of the following form:
825 /// [O0, O1, O2, O3, O4, O5, O6, O7, O8] -> [OI, O5, OJ]
826 /// where I is @p FirstDim, J is @p SecondDim.
827 ///
828 /// It can be used, for example, to create relations that helps to consequently
829 /// access elements of operands of a matrix multiplication after creation of
830 /// the BLIS micro and macro kernels.
831 ///
832 /// @see ScheduleTreeOptimizer::createMicroKernel
833 /// @see ScheduleTreeOptimizer::createMacroKernel
834 ///
835 /// Subsequently, the described access relation is applied to the range of
836 /// @p MapOldIndVar, that is used to map original induction variables to
837 /// the ones, which are produced by schedule transformations. It helps to
838 /// define relations using a new space and, at the same time, keep them
839 /// in the original one.
840 ///
841 /// @param MapOldIndVar The relation, which maps original induction variables
842 ///                     to the ones, which are produced by schedule
843 ///                     transformations.
844 /// @param FirstDim, SecondDim The input dimensions that are used to define
845 ///        the specified access relation.
846 /// @return The specified access relation.
847 __isl_give isl_map *getMatMulAccRel(__isl_take isl_map *MapOldIndVar,
848                                     unsigned FirstDim, unsigned SecondDim) {
849   auto *Ctx = isl_map_get_ctx(MapOldIndVar);
850   auto *AccessRelSpace = isl_space_alloc(Ctx, 0, 9, 3);
851   auto *AccessRel = isl_map_universe(AccessRelSpace);
852   AccessRel = isl_map_equate(AccessRel, isl_dim_in, FirstDim, isl_dim_out, 0);
853   AccessRel = isl_map_equate(AccessRel, isl_dim_in, 5, isl_dim_out, 1);
854   AccessRel = isl_map_equate(AccessRel, isl_dim_in, SecondDim, isl_dim_out, 2);
855   return isl_map_apply_range(MapOldIndVar, AccessRel);
856 }
857 
858 __isl_give isl_schedule_node *
859 createExtensionNode(__isl_take isl_schedule_node *Node,
860                     __isl_take isl_map *ExtensionMap) {
861   auto *Extension = isl_union_map_from_map(ExtensionMap);
862   auto *NewNode = isl_schedule_node_from_extension(Extension);
863   return isl_schedule_node_graft_before(Node, NewNode);
864 }
865 
866 /// Apply the packing transformation.
867 ///
868 /// The packing transformation can be described as a data-layout
869 /// transformation that requires to introduce a new array, copy data
870 /// to the array, and change memory access locations to reference the array.
871 /// It can be used to ensure that elements of the new array are read in-stride
872 /// access, aligned to cache lines boundaries, and preloaded into certain cache
873 /// levels.
874 ///
875 /// As an example let us consider the packing of the array A that would help
876 /// to read its elements with in-stride access. An access to the array A
877 /// is represented by an access relation that has the form
878 /// S[i, j, k] -> A[i, k]. The scheduling function of the SCoP statement S has
879 /// the form S[i,j, k] -> [floor((j mod Nc) / Nr), floor((i mod Mc) / Mr),
880 /// k mod Kc, j mod Nr, i mod Mr].
881 ///
882 /// To ensure that elements of the array A are read in-stride access, we add
883 /// a new array Packed_A[Mc/Mr][Kc][Mr] to the SCoP, using
884 /// Scop::createScopArrayInfo, change the access relation
885 /// S[i, j, k] -> A[i, k] to
886 /// S[i, j, k] -> Packed_A[floor((i mod Mc) / Mr), k mod Kc, i mod Mr], using
887 /// MemoryAccess::setNewAccessRelation, and copy the data to the array, using
888 /// the copy statement created by Scop::addScopStmt.
889 ///
890 /// @param Node The schedule node to be optimized.
891 /// @param MapOldIndVar The relation, which maps original induction variables
892 ///                     to the ones, which are produced by schedule
893 ///                     transformations.
894 /// @param MicroParams, MacroParams Parameters of the BLIS kernel
895 ///                                 to be taken into account.
896 /// @return The optimized schedule node.
897 static __isl_give isl_schedule_node *optimizeDataLayoutMatrMulPattern(
898     __isl_take isl_schedule_node *Node, __isl_take isl_map *MapOldIndVar,
899     MicroKernelParamsTy MicroParams, MacroKernelParamsTy MacroParams) {
900   // Check whether memory accesses of the SCoP statement correspond to
901   // the matrix multiplication pattern and if this is true, obtain them.
902   auto InputDimsId = isl_map_get_tuple_id(MapOldIndVar, isl_dim_in);
903   auto *Stmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
904   isl_id_free(InputDimsId);
905   MemoryAccess *MemAccessA = identifyAccessA(Stmt);
906   MemoryAccess *MemAccessB = identifyAccessB(Stmt);
907   if (!MemAccessA || !MemAccessB) {
908     isl_map_free(MapOldIndVar);
909     return Node;
910   }
911 
912   // Create a copy statement that corresponds to the memory access to the
913   // matrix B, the second operand of the matrix multiplication.
914   Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
915   Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
916   Node = isl_schedule_node_parent(Node);
917   Node = isl_schedule_node_child(isl_schedule_node_band_split(Node, 2), 0);
918   auto *AccRel = getMatMulAccRel(isl_map_copy(MapOldIndVar), 3, 7);
919   unsigned FirstDimSize = MacroParams.Nc / MicroParams.Nr;
920   unsigned SecondDimSize = MacroParams.Kc;
921   unsigned ThirdDimSize = MicroParams.Nr;
922   auto *SAI = Stmt->getParent()->createScopArrayInfo(
923       MemAccessB->getElementType(), "Packed_B",
924       {FirstDimSize, SecondDimSize, ThirdDimSize});
925   AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
926   auto *OldAcc = MemAccessB->getAccessRelation();
927   MemAccessB->setNewAccessRelation(AccRel);
928   auto *ExtMap =
929       getMatMulExt(Stmt->getIslCtx(), 0, MacroParams.Nc, MacroParams.Kc);
930   isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
931   isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
932   ExtMap = isl_map_project_out(ExtMap, isl_dim_in, 2, 1);
933   auto *Domain = Stmt->getDomain();
934 
935   // Restrict the domains of the copy statements to only execute when also its
936   // originating statement is executed.
937   auto *DomainId = isl_set_get_tuple_id(Domain);
938   auto *NewStmt = Stmt->getParent()->addScopStmt(
939       OldAcc, MemAccessB->getAccessRelation(), isl_set_copy(Domain));
940   ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, isl_id_copy(DomainId));
941   ExtMap = isl_map_intersect_range(ExtMap, isl_set_copy(Domain));
942   ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
943   Node = createExtensionNode(Node, ExtMap);
944 
945   // Create a copy statement that corresponds to the memory access
946   // to the matrix A, the first operand of the matrix multiplication.
947   Node = isl_schedule_node_child(Node, 0);
948   AccRel = getMatMulAccRel(MapOldIndVar, 4, 6);
949   FirstDimSize = MacroParams.Mc / MicroParams.Mr;
950   ThirdDimSize = MicroParams.Mr;
951   SAI = Stmt->getParent()->createScopArrayInfo(
952       MemAccessA->getElementType(), "Packed_A",
953       {FirstDimSize, SecondDimSize, ThirdDimSize});
954   AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
955   OldAcc = MemAccessA->getAccessRelation();
956   MemAccessA->setNewAccessRelation(AccRel);
957   ExtMap = getMatMulExt(Stmt->getIslCtx(), MacroParams.Mc, 0, MacroParams.Kc);
958   isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
959   isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
960   NewStmt = Stmt->getParent()->addScopStmt(
961       OldAcc, MemAccessA->getAccessRelation(), isl_set_copy(Domain));
962 
963   // Restrict the domains of the copy statements to only execute when also its
964   // originating statement is executed.
965   ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, DomainId);
966   ExtMap = isl_map_intersect_range(ExtMap, Domain);
967   ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
968   Node = createExtensionNode(Node, ExtMap);
969   Node = isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
970   return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
971 }
972 
973 /// Get a relation mapping induction variables produced by schedule
974 /// transformations to the original ones.
975 ///
976 /// @param Node The schedule node produced as the result of creation
977 ///        of the BLIS kernels.
978 /// @param MicroKernelParams, MacroKernelParams Parameters of the BLIS kernel
979 ///                                             to be taken into account.
980 /// @return  The relation mapping original induction variables to the ones
981 ///          produced by schedule transformation.
982 /// @see ScheduleTreeOptimizer::createMicroKernel
983 /// @see ScheduleTreeOptimizer::createMacroKernel
984 /// @see getMacroKernelParams
985 __isl_give isl_map *
986 getInductionVariablesSubstitution(__isl_take isl_schedule_node *Node,
987                                   MicroKernelParamsTy MicroKernelParams,
988                                   MacroKernelParamsTy MacroKernelParams) {
989   auto *Child = isl_schedule_node_get_child(Node, 0);
990   auto *UnMapOldIndVar = isl_schedule_node_get_prefix_schedule_union_map(Child);
991   isl_schedule_node_free(Child);
992   auto *MapOldIndVar = isl_map_from_union_map(UnMapOldIndVar);
993   if (isl_map_dim(MapOldIndVar, isl_dim_out) > 9)
994     MapOldIndVar =
995         isl_map_project_out(MapOldIndVar, isl_dim_out, 0,
996                             isl_map_dim(MapOldIndVar, isl_dim_out) - 9);
997   return MapOldIndVar;
998 }
999 
1000 __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern(
1001     __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
1002   assert(TTI && "The target transform info should be provided.");
1003   auto MicroKernelParams = getMicroKernelParams(TTI);
1004   auto MacroKernelParams = getMacroKernelParams(MicroKernelParams);
1005   Node = createMacroKernel(Node, MacroKernelParams);
1006   Node = createMicroKernel(Node, MicroKernelParams);
1007   if (MacroKernelParams.Mc == 1 || MacroKernelParams.Nc == 1 ||
1008       MacroKernelParams.Kc == 1)
1009     return Node;
1010   auto *MapOldIndVar = getInductionVariablesSubstitution(
1011       Node, MicroKernelParams, MacroKernelParams);
1012   if (!MapOldIndVar)
1013     return Node;
1014   return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams,
1015                                           MacroKernelParams);
1016 }
1017 
1018 bool ScheduleTreeOptimizer::isMatrMultPattern(
1019     __isl_keep isl_schedule_node *Node) {
1020   auto *PartialSchedule =
1021       isl_schedule_node_band_get_partial_schedule_union_map(Node);
1022   if (isl_schedule_node_band_n_member(Node) != 3 ||
1023       isl_union_map_n_map(PartialSchedule) != 1) {
1024     isl_union_map_free(PartialSchedule);
1025     return false;
1026   }
1027   auto *NewPartialSchedule = isl_map_from_union_map(PartialSchedule);
1028   NewPartialSchedule = circularShiftOutputDims(NewPartialSchedule);
1029   if (containsMatrMult(NewPartialSchedule)) {
1030     isl_map_free(NewPartialSchedule);
1031     return true;
1032   }
1033   isl_map_free(NewPartialSchedule);
1034   return false;
1035 }
1036 
1037 __isl_give isl_schedule_node *
1038 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node,
1039                                     void *User) {
1040   if (!isTileableBandNode(Node))
1041     return Node;
1042 
1043   if (PMBasedOpts && User && isMatrMultPattern(Node)) {
1044     DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
1045     const llvm::TargetTransformInfo *TTI;
1046     TTI = static_cast<const llvm::TargetTransformInfo *>(User);
1047     Node = optimizeMatMulPattern(Node, TTI);
1048   }
1049 
1050   return standardBandOpts(Node, User);
1051 }
1052 
1053 __isl_give isl_schedule *
1054 ScheduleTreeOptimizer::optimizeSchedule(__isl_take isl_schedule *Schedule,
1055                                         const llvm::TargetTransformInfo *TTI) {
1056   isl_schedule_node *Root = isl_schedule_get_root(Schedule);
1057   Root = optimizeScheduleNode(Root, TTI);
1058   isl_schedule_free(Schedule);
1059   auto S = isl_schedule_node_get_schedule(Root);
1060   isl_schedule_node_free(Root);
1061   return S;
1062 }
1063 
1064 __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeScheduleNode(
1065     __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
1066   Node = isl_schedule_node_map_descendant_bottom_up(
1067       Node, optimizeBand, const_cast<void *>(static_cast<const void *>(TTI)));
1068   return Node;
1069 }
1070 
1071 bool ScheduleTreeOptimizer::isProfitableSchedule(
1072     Scop &S, __isl_keep isl_schedule *NewSchedule) {
1073   // To understand if the schedule has been optimized we check if the schedule
1074   // has changed at all.
1075   // TODO: We can improve this by tracking if any necessarily beneficial
1076   // transformations have been performed. This can e.g. be tiling, loop
1077   // interchange, or ...) We can track this either at the place where the
1078   // transformation has been performed or, in case of automatic ILP based
1079   // optimizations, by comparing (yet to be defined) performance metrics
1080   // before/after the scheduling optimizer
1081   // (e.g., #stride-one accesses)
1082   if (S.containsExtensionNode(NewSchedule))
1083     return true;
1084   auto *NewScheduleMap = isl_schedule_get_map(NewSchedule);
1085   isl_union_map *OldSchedule = S.getSchedule();
1086   assert(OldSchedule && "Only IslScheduleOptimizer can insert extension nodes "
1087                         "that make Scop::getSchedule() return nullptr.");
1088   bool changed = !isl_union_map_is_equal(OldSchedule, NewScheduleMap);
1089   isl_union_map_free(OldSchedule);
1090   isl_union_map_free(NewScheduleMap);
1091   return changed;
1092 }
1093 
1094 namespace {
1095 class IslScheduleOptimizer : public ScopPass {
1096 public:
1097   static char ID;
1098   explicit IslScheduleOptimizer() : ScopPass(ID) { LastSchedule = nullptr; }
1099 
1100   ~IslScheduleOptimizer() { isl_schedule_free(LastSchedule); }
1101 
1102   /// Optimize the schedule of the SCoP @p S.
1103   bool runOnScop(Scop &S) override;
1104 
1105   /// Print the new schedule for the SCoP @p S.
1106   void printScop(raw_ostream &OS, Scop &S) const override;
1107 
1108   /// Register all analyses and transformation required.
1109   void getAnalysisUsage(AnalysisUsage &AU) const override;
1110 
1111   /// Release the internal memory.
1112   void releaseMemory() override {
1113     isl_schedule_free(LastSchedule);
1114     LastSchedule = nullptr;
1115   }
1116 
1117 private:
1118   isl_schedule *LastSchedule;
1119 };
1120 } // namespace
1121 
1122 char IslScheduleOptimizer::ID = 0;
1123 
1124 bool IslScheduleOptimizer::runOnScop(Scop &S) {
1125 
1126   // Skip empty SCoPs but still allow code generation as it will delete the
1127   // loops present but not needed.
1128   if (S.getSize() == 0) {
1129     S.markAsOptimized();
1130     return false;
1131   }
1132 
1133   const Dependences &D =
1134       getAnalysis<DependenceInfo>().getDependences(Dependences::AL_Statement);
1135 
1136   if (!D.hasValidDependences())
1137     return false;
1138 
1139   isl_schedule_free(LastSchedule);
1140   LastSchedule = nullptr;
1141 
1142   // Build input data.
1143   int ValidityKinds =
1144       Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
1145   int ProximityKinds;
1146 
1147   if (OptimizeDeps == "all")
1148     ProximityKinds =
1149         Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
1150   else if (OptimizeDeps == "raw")
1151     ProximityKinds = Dependences::TYPE_RAW;
1152   else {
1153     errs() << "Do not know how to optimize for '" << OptimizeDeps << "'"
1154            << " Falling back to optimizing all dependences.\n";
1155     ProximityKinds =
1156         Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
1157   }
1158 
1159   isl_union_set *Domain = S.getDomains();
1160 
1161   if (!Domain)
1162     return false;
1163 
1164   isl_union_map *Validity = D.getDependences(ValidityKinds);
1165   isl_union_map *Proximity = D.getDependences(ProximityKinds);
1166 
1167   // Simplify the dependences by removing the constraints introduced by the
1168   // domains. This can speed up the scheduling time significantly, as large
1169   // constant coefficients will be removed from the dependences. The
1170   // introduction of some additional dependences reduces the possible
1171   // transformations, but in most cases, such transformation do not seem to be
1172   // interesting anyway. In some cases this option may stop the scheduler to
1173   // find any schedule.
1174   if (SimplifyDeps == "yes") {
1175     Validity = isl_union_map_gist_domain(Validity, isl_union_set_copy(Domain));
1176     Validity = isl_union_map_gist_range(Validity, isl_union_set_copy(Domain));
1177     Proximity =
1178         isl_union_map_gist_domain(Proximity, isl_union_set_copy(Domain));
1179     Proximity = isl_union_map_gist_range(Proximity, isl_union_set_copy(Domain));
1180   } else if (SimplifyDeps != "no") {
1181     errs() << "warning: Option -polly-opt-simplify-deps should either be 'yes' "
1182               "or 'no'. Falling back to default: 'yes'\n";
1183   }
1184 
1185   DEBUG(dbgs() << "\n\nCompute schedule from: ");
1186   DEBUG(dbgs() << "Domain := " << stringFromIslObj(Domain) << ";\n");
1187   DEBUG(dbgs() << "Proximity := " << stringFromIslObj(Proximity) << ";\n");
1188   DEBUG(dbgs() << "Validity := " << stringFromIslObj(Validity) << ";\n");
1189 
1190   unsigned IslSerializeSCCs;
1191 
1192   if (FusionStrategy == "max") {
1193     IslSerializeSCCs = 0;
1194   } else if (FusionStrategy == "min") {
1195     IslSerializeSCCs = 1;
1196   } else {
1197     errs() << "warning: Unknown fusion strategy. Falling back to maximal "
1198               "fusion.\n";
1199     IslSerializeSCCs = 0;
1200   }
1201 
1202   int IslMaximizeBands;
1203 
1204   if (MaximizeBandDepth == "yes") {
1205     IslMaximizeBands = 1;
1206   } else if (MaximizeBandDepth == "no") {
1207     IslMaximizeBands = 0;
1208   } else {
1209     errs() << "warning: Option -polly-opt-maximize-bands should either be 'yes'"
1210               " or 'no'. Falling back to default: 'yes'\n";
1211     IslMaximizeBands = 1;
1212   }
1213 
1214   int IslOuterCoincidence;
1215 
1216   if (OuterCoincidence == "yes") {
1217     IslOuterCoincidence = 1;
1218   } else if (OuterCoincidence == "no") {
1219     IslOuterCoincidence = 0;
1220   } else {
1221     errs() << "warning: Option -polly-opt-outer-coincidence should either be "
1222               "'yes' or 'no'. Falling back to default: 'no'\n";
1223     IslOuterCoincidence = 0;
1224   }
1225 
1226   isl_ctx *Ctx = S.getIslCtx();
1227 
1228   isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence);
1229   isl_options_set_schedule_serialize_sccs(Ctx, IslSerializeSCCs);
1230   isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands);
1231   isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm);
1232   isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient);
1233   isl_options_set_tile_scale_tile_loops(Ctx, 0);
1234 
1235   auto OnErrorStatus = isl_options_get_on_error(Ctx);
1236   isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE);
1237 
1238   isl_schedule_constraints *ScheduleConstraints;
1239   ScheduleConstraints = isl_schedule_constraints_on_domain(Domain);
1240   ScheduleConstraints =
1241       isl_schedule_constraints_set_proximity(ScheduleConstraints, Proximity);
1242   ScheduleConstraints = isl_schedule_constraints_set_validity(
1243       ScheduleConstraints, isl_union_map_copy(Validity));
1244   ScheduleConstraints =
1245       isl_schedule_constraints_set_coincidence(ScheduleConstraints, Validity);
1246   isl_schedule *Schedule;
1247   Schedule = isl_schedule_constraints_compute_schedule(ScheduleConstraints);
1248   isl_options_set_on_error(Ctx, OnErrorStatus);
1249 
1250   // In cases the scheduler is not able to optimize the code, we just do not
1251   // touch the schedule.
1252   if (!Schedule)
1253     return false;
1254 
1255   DEBUG({
1256     auto *P = isl_printer_to_str(Ctx);
1257     P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
1258     P = isl_printer_print_schedule(P, Schedule);
1259     auto *str = isl_printer_get_str(P);
1260     dbgs() << "NewScheduleTree: \n" << str << "\n";
1261     free(str);
1262     isl_printer_free(P);
1263   });
1264 
1265   Function &F = S.getFunction();
1266   auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1267   isl_schedule *NewSchedule =
1268       ScheduleTreeOptimizer::optimizeSchedule(Schedule, TTI);
1269 
1270   if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewSchedule)) {
1271     isl_schedule_free(NewSchedule);
1272     return false;
1273   }
1274 
1275   S.setScheduleTree(NewSchedule);
1276   S.markAsOptimized();
1277 
1278   if (OptimizedScops)
1279     S.dump();
1280 
1281   return false;
1282 }
1283 
1284 void IslScheduleOptimizer::printScop(raw_ostream &OS, Scop &) const {
1285   isl_printer *p;
1286   char *ScheduleStr;
1287 
1288   OS << "Calculated schedule:\n";
1289 
1290   if (!LastSchedule) {
1291     OS << "n/a\n";
1292     return;
1293   }
1294 
1295   p = isl_printer_to_str(isl_schedule_get_ctx(LastSchedule));
1296   p = isl_printer_print_schedule(p, LastSchedule);
1297   ScheduleStr = isl_printer_get_str(p);
1298   isl_printer_free(p);
1299 
1300   OS << ScheduleStr << "\n";
1301 }
1302 
1303 void IslScheduleOptimizer::getAnalysisUsage(AnalysisUsage &AU) const {
1304   ScopPass::getAnalysisUsage(AU);
1305   AU.addRequired<DependenceInfo>();
1306   AU.addRequired<TargetTransformInfoWrapperPass>();
1307 }
1308 
1309 Pass *polly::createIslScheduleOptimizerPass() {
1310   return new IslScheduleOptimizer();
1311 }
1312 
1313 INITIALIZE_PASS_BEGIN(IslScheduleOptimizer, "polly-opt-isl",
1314                       "Polly - Optimize schedule of SCoP", false, false);
1315 INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
1316 INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass);
1317 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass);
1318 INITIALIZE_PASS_END(IslScheduleOptimizer, "polly-opt-isl",
1319                     "Polly - Optimize schedule of SCoP", false, false)
1320