1 //===- Schedule.cpp - Calculate an optimized schedule ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass generates an entirely new schedule tree from the data dependences
10 // and iteration domains. The new schedule tree is computed in two steps:
11 //
12 // 1) The isl scheduling optimizer is run
13 //
14 // The isl scheduling optimizer creates a new schedule tree that maximizes
15 // parallelism and tileability and minimizes data-dependence distances. The
16 // algorithm used is a modified version of the ``Pluto'' algorithm:
17 //
18 //   U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
19 //   A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
20 //   In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
21 //   Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008.
22 //
23 // 2) A set of post-scheduling transformations is applied on the schedule tree.
24 //
25 // These optimizations include:
26 //
27 //  - Tiling of the innermost tilable bands
28 //  - Prevectorization - The choice of a possible outer loop that is strip-mined
29 //                       to the innermost level to enable inner-loop
30 //                       vectorization.
31 //  - Some optimizations for spatial locality are also planned.
32 //
33 // For a detailed description of the schedule tree itself please see section 6
34 // of:
35 //
36 // Polyhedral AST generation is more than scanning polyhedra
37 // Tobias Grosser, Sven Verdoolaege, Albert Cohen
38 // ACM Transactions on Programming Languages and Systems (TOPLAS),
39 // 37(4), July 2015
40 // http://www.grosser.es/#pub-polyhedral-AST-generation
41 //
42 // This publication also contains a detailed discussion of the different options
43 // for polyhedral loop unrolling, full/partial tile separation and other uses
44 // of the schedule tree.
45 //
46 //===----------------------------------------------------------------------===//
47 
48 #include "polly/ScheduleOptimizer.h"
49 #include "polly/CodeGen/CodeGeneration.h"
50 #include "polly/DependenceInfo.h"
51 #include "polly/LinkAllPasses.h"
52 #include "polly/Options.h"
53 #include "polly/ScheduleTreeTransform.h"
54 #include "polly/ScopInfo.h"
55 #include "polly/ScopPass.h"
56 #include "polly/Simplify.h"
57 #include "polly/Support/ISLOStream.h"
58 #include "llvm/ADT/Sequence.h"
59 #include "llvm/ADT/Statistic.h"
60 #include "llvm/Analysis/TargetTransformInfo.h"
61 #include "llvm/IR/Function.h"
62 #include "llvm/InitializePasses.h"
63 #include "llvm/Support/CommandLine.h"
64 #include "llvm/Support/Debug.h"
65 #include "llvm/Support/raw_ostream.h"
66 #include "isl/ctx.h"
67 #include "isl/options.h"
68 #include "isl/printer.h"
69 #include "isl/schedule.h"
70 #include "isl/schedule_node.h"
71 #include "isl/union_map.h"
72 #include "isl/union_set.h"
73 #include <algorithm>
74 #include <cassert>
75 #include <cmath>
76 #include <cstdint>
77 #include <cstdlib>
78 #include <string>
79 #include <vector>
80 
81 using namespace llvm;
82 using namespace polly;
83 
84 #define DEBUG_TYPE "polly-opt-isl"
85 
// Restrict which dependence kinds the scheduler optimizes (all dependences,
// or only read-after-write).
static cl::opt<std::string>
    OptimizeDeps("polly-opt-optimize-only",
                 cl::desc("Only a certain kind of dependences (all/raw)"),
                 cl::Hidden, cl::init("all"), cl::ZeroOrMore,
                 cl::cat(PollyCategory));

// Whether to simplify the dependences before passing them to the scheduler.
static cl::opt<std::string>
    SimplifyDeps("polly-opt-simplify-deps",
                 cl::desc("Dependences should be simplified (yes/no)"),
                 cl::Hidden, cl::init("yes"), cl::ZeroOrMore,
                 cl::cat(PollyCategory));

// Bound on the constant term in scheduling constraints; forwarded to isl.
static cl::opt<int> MaxConstantTerm(
    "polly-opt-max-constant-term",
    cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden,
    cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));

// Bound on schedule coefficients; forwarded to isl.
static cl::opt<int> MaxCoefficient(
    "polly-opt-max-coefficient",
    cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden,
    cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));

// Loop fusion strategy used by the isl scheduler.
static cl::opt<std::string> FusionStrategy(
    "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"),
    cl::Hidden, cl::init("min"), cl::ZeroOrMore, cl::cat(PollyCategory));

// Whether the isl scheduler should try to build bands with as many members
// as possible.
static cl::opt<std::string>
    MaximizeBandDepth("polly-opt-maximize-bands",
                      cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
                      cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<std::string> OuterCoincidence(
    "polly-opt-outer-coincidence",
    cl::desc("Try to construct schedules where the outer member of each band "
             "satisfies the coincidence constraints (yes/no)"),
    cl::Hidden, cl::init("no"), cl::ZeroOrMore, cl::cat(PollyCategory));

// Strip-mine factor used when preparing a loop for vectorization.
static cl::opt<int> PrevectorWidth(
    "polly-prevect-width",
    cl::desc(
        "The number of loop iterations to strip-mine for pre-vectorization"),
    cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory));

// Master switch for first-level loop tiling.
static cl::opt<bool> FirstLevelTiling("polly-tiling",
                                      cl::desc("Enable loop tiling"),
                                      cl::init(true), cl::ZeroOrMore,
                                      cl::cat(PollyCategory));

// Target latency of a vector FMA; used to size the matmul micro-kernel.
static cl::opt<int> LatencyVectorFma(
    "polly-target-latency-vector-fma",
    cl::desc("The minimal number of cycles between issuing two "
             "dependent consecutive vector fused multiply-add "
             "instructions."),
    cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));

// Target throughput of vector FMAs; used to size the matmul micro-kernel.
static cl::opt<int> ThroughputVectorFma(
    "polly-target-throughput-vector-fma",
    cl::desc("A throughput of the processor floating-point arithmetic units "
             "expressed in the number of vector fused multiply-add "
             "instructions per clock cycle."),
    cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory));
147 
// This option, along with --polly-target-2nd-cache-level-associativity,
// --polly-target-1st-cache-level-size, and --polly-target-2nd-cache-level-size
// represent the parameters of the target cache, which do not have typical
// values that can be used by default. However, to apply the pattern matching
// optimizations, we use the values of the parameters of Intel Core i7-3820
// SandyBridge in case the parameters are not specified or not provided by the
// TargetTransformInfo.
static cl::opt<int> FirstCacheLevelAssociativity(
    "polly-target-1st-cache-level-associativity",
    cl::desc("The associativity of the first cache level."), cl::Hidden,
    cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));

// Fallback used when the option above is left at -1 and TTI has no data.
static cl::opt<int> FirstCacheLevelDefaultAssociativity(
    "polly-target-1st-cache-level-default-associativity",
    cl::desc("The default associativity of the first cache level"
             " (if not enough were provided by the TargetTransformInfo)."),
    cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<int> SecondCacheLevelAssociativity(
    "polly-target-2nd-cache-level-associativity",
    cl::desc("The associativity of the second cache level."), cl::Hidden,
    cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));

// Fallback used when the option above is left at -1 and TTI has no data.
static cl::opt<int> SecondCacheLevelDefaultAssociativity(
    "polly-target-2nd-cache-level-default-associativity",
    cl::desc("The default associativity of the second cache level"
             " (if not enough were provided by the TargetTransformInfo)."),
    cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<int> FirstCacheLevelSize(
    "polly-target-1st-cache-level-size",
    cl::desc("The size of the first cache level specified in bytes."),
    cl::Hidden, cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));

// Fallback used when the option above is left at -1 and TTI has no data.
static cl::opt<int> FirstCacheLevelDefaultSize(
    "polly-target-1st-cache-level-default-size",
    cl::desc("The default size of the first cache level specified in bytes"
             " (if not enough were provided by the TargetTransformInfo)."),
    cl::Hidden, cl::init(32768), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<int> SecondCacheLevelSize(
    "polly-target-2nd-cache-level-size",
    cl::desc("The size of the second level specified in bytes."), cl::Hidden,
    cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));

// Fallback used when the option above is left at -1 and TTI has no data.
static cl::opt<int> SecondCacheLevelDefaultSize(
    "polly-target-2nd-cache-level-default-size",
    cl::desc("The default size of the second cache level specified in bytes"
             " (if not enough were provided by the TargetTransformInfo)."),
    cl::Hidden, cl::init(262144), cl::ZeroOrMore, cl::cat(PollyCategory));

// Override for the vector register width used during prevectorization; -1
// means query LLVM's target information instead.
static cl::opt<int> VectorRegisterBitwidth(
    "polly-target-vector-register-bitwidth",
    cl::desc("The size in bits of a vector register (if not set, this "
             "information is taken from LLVM's target information."),
    cl::Hidden, cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
204 
// Tile size used for dimensions not covered by --polly-tile-sizes.
static cl::opt<int> FirstLevelDefaultTileSize(
    "polly-default-tile-size",
    cl::desc("The default tile size (if not enough were provided by"
             " --polly-tile-sizes)"),
    cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory));

// Per-dimension first-level tile sizes (comma separated).
static cl::list<int>
    FirstLevelTileSizes("polly-tile-sizes",
                        cl::desc("A tile size for each loop dimension, filled "
                                 "with --polly-default-tile-size"),
                        cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
                        cl::cat(PollyCategory));

// Master switch for second-level (tile-of-tiles) loop tiling.
static cl::opt<bool>
    SecondLevelTiling("polly-2nd-level-tiling",
                      cl::desc("Enable a 2nd level loop of loop tiling"),
                      cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

// Tile size used for dimensions not covered by --polly-2nd-level-tile-sizes.
static cl::opt<int> SecondLevelDefaultTileSize(
    "polly-2nd-level-default-tile-size",
    cl::desc("The default 2nd-level tile size (if not enough were provided by"
             " --polly-2nd-level-tile-sizes)"),
    cl::Hidden, cl::init(16), cl::ZeroOrMore, cl::cat(PollyCategory));

// Per-dimension second-level tile sizes (comma separated).
static cl::list<int>
    SecondLevelTileSizes("polly-2nd-level-tile-sizes",
                         cl::desc("A tile size for each loop dimension, filled "
                                  "with --polly-default-tile-size"),
                         cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
                         cl::cat(PollyCategory));

// Master switch for register tiling (tiling plus point-loop unrolling).
static cl::opt<bool> RegisterTiling("polly-register-tiling",
                                    cl::desc("Enable register tiling"),
                                    cl::init(false), cl::ZeroOrMore,
                                    cl::cat(PollyCategory));

// Tile size used for dimensions not covered by --polly-register-tile-sizes.
static cl::opt<int> RegisterDefaultTileSize(
    "polly-register-tiling-default-tile-size",
    cl::desc("The default register tile size (if not enough were provided by"
             " --polly-register-tile-sizes)"),
    cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory));

// Ratio between the macro-kernel parameter Nc and the micro-kernel
// parameter Nr used by the matmul pattern optimization.
static cl::opt<int> PollyPatternMatchingNcQuotient(
    "polly-pattern-matching-nc-quotient",
    cl::desc("Quotient that is obtained by dividing Nc, the parameter of the"
             "macro-kernel, by Nr, the parameter of the micro-kernel"),
    cl::Hidden, cl::init(256), cl::ZeroOrMore, cl::cat(PollyCategory));

// Per-dimension register tile sizes (comma separated).
static cl::list<int>
    RegisterTileSizes("polly-register-tile-sizes",
                      cl::desc("A tile size for each loop dimension, filled "
                               "with --polly-register-tile-size"),
                      cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
                      cl::cat(PollyCategory));

// Master switch for the pattern-matching (e.g. matmul) optimizations.
static cl::opt<bool>
    PMBasedOpts("polly-pattern-matching-based-opts",
                cl::desc("Perform optimizations based on pattern matching"),
                cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory));

// Debugging aid: dump the polyhedral description of each optimized SCoP.
static cl::opt<bool> OptimizedScops(
    "polly-optimized-scops",
    cl::desc("Polly - Dump polyhedral description of Scops optimized with "
             "the isl scheduling optimizer and the set of post-scheduling "
             "transformations is applied on the schedule tree"),
    cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
271 
STATISTIC(ScopsProcessed, "Number of scops processed");
STATISTIC(ScopsRescheduled, "Number of scops rescheduled");
STATISTIC(ScopsOptimized, "Number of scops optimized");

STATISTIC(NumAffineLoopsOptimized, "Number of affine loops optimized");
STATISTIC(NumBoxedLoopsOptimized, "Number of boxed loops optimized");

// Declare a statistic triple indexed by pipeline stage:
// [0] original schedule, [1] after the isl scheduler, [2] after the
// post-scheduling optimizations.
#define THREE_STATISTICS(VARNAME, DESC)                                        \
  static Statistic VARNAME[3] = {                                              \
      {DEBUG_TYPE, #VARNAME "0", DESC " (original)"},                          \
      {DEBUG_TYPE, #VARNAME "1", DESC " (after scheduler)"},                   \
      {DEBUG_TYPE, #VARNAME "2", DESC " (after optimizer)"}}

THREE_STATISTICS(NumBands, "Number of bands");
THREE_STATISTICS(NumBandMembers, "Number of band members");
THREE_STATISTICS(NumCoincident, "Number of coincident band members");
THREE_STATISTICS(NumPermutable, "Number of permutable bands");
THREE_STATISTICS(NumFilters, "Number of filter nodes");
THREE_STATISTICS(NumExtension, "Number of extension nodes");

STATISTIC(FirstLevelTileOpts, "Number of first level tiling applied");
STATISTIC(SecondLevelTileOpts, "Number of second level tiling applied");
STATISTIC(RegisterTileOpts, "Number of register tiling applied");
STATISTIC(PrevectOpts, "Number of strip-mining for prevectorization applied");
STATISTIC(MatMulOpts,
          "Number of matrix multiplication patterns detected and optimized");
298 
299 namespace {
/// Parameters of the micro kernel.
///
/// Parameters, which determine sizes of rank-1 (i.e., outer product) update
/// used in the optimized matrix multiplication.
struct MicroKernelParamsTy {
  int Mr; // Number of rows of the rank-1 update block.
  int Nr; // Number of columns of the rank-1 update block.
};
308 
/// Parameters of the macro kernel.
///
/// Parameters, which determine sizes of blocks of partitioned matrices
/// used in the optimized matrix multiplication.
struct MacroKernelParamsTy {
  int Mc; // Block size along the first dimension of the first operand.
  int Nc; // Block size along the second dimension of the second operand.
  int Kc; // Block size along the shared (reduction) dimension.
};
318 
/// Additional parameters of the schedule optimizer.
///
/// Target Transform Info and the SCoP dependencies used by the schedule
/// optimizer.
struct OptimizerAdditionalInfoTy {
  const llvm::TargetTransformInfo *TTI; // Target information (non-owning).
  const Dependences *D;                 // SCoP dependences (non-owning).
};
327 
/// Parameters of the matrix multiplication operands.
///
/// Parameters, which describe access relations that represent operands of the
/// matrix multiplication.
struct MatMulInfoTy {
  MemoryAccess *A = nullptr;         // Access to the first input matrix.
  MemoryAccess *B = nullptr;         // Access to the second input matrix.
  MemoryAccess *ReadFromC = nullptr; // Read access to the output matrix.
  MemoryAccess *WriteToC = nullptr;  // Write access to the output matrix.
  // Schedule dimensions of the i/j/k loops; -1 means not yet determined.
  int i = -1;
  int j = -1;
  int k = -1;
};
341 
class ScheduleTreeOptimizer {
public:
  /// Apply schedule tree transformations.
  ///
  /// This function takes a (possibly already optimized) schedule tree and
  /// applies a set of additional optimizations on the schedule tree. The
  /// transformations applied include:
  ///
  ///   - Tiling
  ///   - Prevectorization
  ///
  /// @param Schedule The schedule object the transformations will be applied
  ///                 to.
  /// @param OAI      Target Transform Info and the SCoP dependencies.
  /// @returns        The transformed schedule.
  static isl::schedule
  optimizeSchedule(isl::schedule Schedule,
                   const OptimizerAdditionalInfoTy *OAI = nullptr);

  /// Apply schedule tree transformations.
  ///
  /// This function takes a node in a (possibly already optimized) schedule
  /// tree and applies a set of additional optimizations on this schedule tree
  /// node and its descendants. The transformations applied include:
  ///
  ///   - Tiling
  ///   - Prevectorization
  ///
  /// @param Node The schedule object post-transformations will be applied to.
  /// @param OAI  Target Transform Info and the SCoP dependencies.
  /// @returns    The transformed schedule.
  static isl::schedule_node
  optimizeScheduleNode(isl::schedule_node Node,
                       const OptimizerAdditionalInfoTy *OAI = nullptr);

  /// Decide if the @p NewSchedule is profitable for @p S.
  ///
  /// @param S           The SCoP we optimize.
  /// @param NewSchedule The new schedule we computed.
  ///
  /// @return True, if we believe @p NewSchedule is an improvement for @p S.
  static bool isProfitableSchedule(polly::Scop &S, isl::schedule NewSchedule);

  /// Isolate a set of partial tile prefixes.
  ///
  /// This set should ensure that it contains only partial tile prefixes that
  /// have exactly VectorWidth iterations.
  ///
  /// @param Node A schedule node band, which is a parent of a band node,
  ///             that contains a vector loop.
  /// @return Modified isl_schedule_node.
  static isl::schedule_node isolateFullPartialTiles(isl::schedule_node Node,
                                                    int VectorWidth);

private:
  /// Tile a schedule node.
  ///
  /// @param Node            The node to tile.
  /// @param Identifier      A name that identifies this kind of tiling and
  ///                        that is used to mark the tiled loops in the
  ///                        generated AST.
  /// @param TileSizes       A vector of tile sizes that should be used for
  ///                        tiling.
  /// @param DefaultTileSize A default tile size that is used for dimensions
  ///                        that are not covered by the TileSizes vector.
  static isl::schedule_node tileNode(isl::schedule_node Node,
                                     const char *Identifier,
                                     llvm::ArrayRef<int> TileSizes,
                                     int DefaultTileSize);

  /// Tile a schedule node and unroll point loops.
  ///
  /// @param Node            The node to register tile.
  /// @param TileSizes       A vector of tile sizes that should be used for
  ///                        tiling.
  /// @param DefaultTileSize A default tile size that is used for dimensions
  static isl::schedule_node applyRegisterTiling(isl::schedule_node Node,
                                                llvm::ArrayRef<int> TileSizes,
                                                int DefaultTileSize);

  /// Apply the BLIS matmul optimization pattern.
  ///
  /// Make the loops containing the matrix multiplication be the innermost
  /// loops and apply the BLIS matmul optimization pattern. BLIS implements
  /// gemm as three nested loops around a macro-kernel, plus two packing
  /// routines. The macro-kernel is implemented in terms of two additional
  /// loops around a micro-kernel. The micro-kernel is a loop around a rank-1
  /// (i.e., outer product) update.
  ///
  /// For a detailed description please see [1].
  ///
  /// The order of the loops defines the data reused in the BLIS implementation
  /// of gemm ([1]). In particular, elements of the matrix B, the second
  /// operand of matrix multiplication, are reused between iterations of the
  /// innermost loop. To keep the reused data in cache, only elements of matrix
  /// A, the first operand of matrix multiplication, should be evicted during
  /// an iteration of the innermost loop. To provide such a cache replacement
  /// policy, elements of the matrix A can, in particular, be loaded first and,
  /// consequently, be least-recently-used.
  ///
  /// In our case matrices are stored in row-major order instead of
  /// column-major order used in the BLIS implementation ([1]). It affects only
  /// the form of the BLIS micro kernel and the computation of its
  /// parameters. In particular, reused elements of the matrix B are
  /// successively multiplied by specific elements of the matrix A.
  ///
  /// Refs.:
  /// [1] - Analytical Modeling is Enough for High Performance BLIS
  /// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti
  /// Technical Report, 2014
  /// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
  ///
  /// @see ScheduleTreeOptimizer::createMicroKernel
  /// @see ScheduleTreeOptimizer::createMacroKernel
  /// @see getMicroKernelParams
  /// @see getMacroKernelParams
  ///
  /// TODO: Implement the packing transformation.
  ///
  /// @param Node The node that contains a band to be optimized. The node
  ///             is required to successfully pass
  ///             ScheduleTreeOptimizer::isMatrMultPattern.
  /// @param TTI  Target Transform Info.
  /// @param MMI  Parameters of the matrix multiplication operands.
  /// @returns    The transformed schedule.
  static isl::schedule_node
  optimizeMatMulPattern(isl::schedule_node Node,
                        const llvm::TargetTransformInfo *TTI,
                        MatMulInfoTy &MMI);

  /// Check if this node is a band node we want to tile.
  ///
  /// We look for innermost band nodes where individual dimensions are marked as
  /// permutable.
  ///
  /// @param Node The node to check.
  static bool isTileableBandNode(isl::schedule_node Node);

  /// Pre-vectorizes one scheduling dimension of a schedule band.
  ///
  /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and
  /// sinks the resulting point loop.
  ///
  /// Example (DimToVectorize=0, VectorWidth=4):
  ///
  /// | Before transformation:
  /// |
  /// | A[i,j] -> [i,j]
  /// |
  /// | for (i = 0; i < 128; i++)
  /// |    for (j = 0; j < 128; j++)
  /// |      A(i,j);
  ///
  /// | After transformation:
  /// |
  /// | for (it = 0; it < 32; it+=1)
  /// |    for (j = 0; j < 128; j++)
  /// |      for (ip = 0; ip <= 3; ip++)
  /// |        A(4 * it + ip,j);
  ///
  /// The goal of this transformation is to create a trivially vectorizable
  /// loop.  This means a parallel loop at the innermost level that has a
  /// constant number of iterations corresponding to the target vector width.
  ///
  /// This transformation creates a loop at the innermost level. The loop has
  /// a constant number of iterations, if the number of loop iterations at
  /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is
  /// currently constant and not yet target specific. This function does not
  /// reason about parallelism.
  static isl::schedule_node prevectSchedBand(isl::schedule_node Node,
                                             unsigned DimToVectorize,
                                             int VectorWidth);

  /// Apply additional optimizations on the bands in the schedule tree.
  ///
  /// We are looking for an innermost band node and apply the following
  /// transformations:
  ///
  ///  - Tile the band
  ///      - if the band is tileable
  ///      - if the band has more than one loop dimension
  ///
  ///  - Prevectorize the schedule of the band (or the point loop in case of
  ///    tiling).
  ///      - if vectorization is enabled
  ///
  /// @param Node The schedule node to (possibly) optimize.
  /// @param User A pointer to forward some use information
  ///        (currently unused).
  static isl_schedule_node *optimizeBand(isl_schedule_node *Node, void *User);

  /// Apply additional optimizations on the bands in the schedule tree.
  ///
  /// We apply the following
  /// transformations:
  ///
  ///  - Tile the band
  ///  - Prevectorize the schedule of the band (or the point loop in case of
  ///    tiling).
  ///      - if vectorization is enabled
  ///
  /// @param Node The schedule node to (possibly) optimize.
  /// @param User A pointer to forward some use information
  ///        (currently unused).
  static isl::schedule_node standardBandOpts(isl::schedule_node Node,
                                             void *User);

  /// Check if this node contains a partial schedule that could
  ///        probably be optimized with analytical modeling.
  ///
  /// isMatrMultPattern tries to determine whether the following conditions
  /// are true:
  /// 1. the partial schedule contains only one statement.
  /// 2. there are exactly three input dimensions.
  /// 3. all memory accesses of the statement will have stride 0 or 1, if we
  ///    interchange loops (switch the variable used in the inner loop to
  ///    the outer loop).
  /// 4. all memory accesses of the statement except from the last one, are
  ///    read memory access and the last one is write memory access.
  /// 5. all subscripts of the last memory access of the statement don't
  ///    contain the variable used in the inner loop.
  /// If this is the case, we could try to use an approach that is similar to
  /// the one used to get close-to-peak performance of matrix multiplications.
  ///
  /// @param Node The node to check.
  /// @param D    The SCoP dependencies.
  /// @param MMI  Parameters of the matrix multiplication operands.
  static bool isMatrMultPattern(isl::schedule_node Node,
                                const polly::Dependences *D, MatMulInfoTy &MMI);

  /// Create the BLIS macro-kernel.
  ///
  /// We create the BLIS macro-kernel by applying a combination of tiling
  /// of dimensions of the band node and interchanging of two innermost
  /// modified dimensions. The values of MacroKernelParams's fields are used
  /// as tile sizes.
  ///
  /// @param Node The schedule node to be modified.
  /// @param MacroKernelParams Parameters of the macro kernel
  ///                          to be used as tile sizes.
  static isl::schedule_node
  createMacroKernel(isl::schedule_node Node,
                    MacroKernelParamsTy MacroKernelParams);

  /// Create the BLIS micro-kernel.
  ///
  /// We create the BLIS micro-kernel by applying a combination of tiling
  /// of dimensions of the band node and interchanging of two innermost
  /// modified dimensions. The values passed in MicroKernelParam are used
  /// as tile sizes.
  ///
  /// @param Node The schedule node to be modified.
  /// @param MicroKernelParams Parameters of the micro kernel
  ///                          to be used as tile sizes.
  /// @see MicroKernelParamsTy
  static isl::schedule_node
  createMicroKernel(isl::schedule_node Node,
                    MicroKernelParamsTy MicroKernelParams);
};
601 
602 /// Create an isl::union_set, which describes the isolate option based on
603 /// IsolateDomain.
604 ///
605 /// @param IsolateDomain An isl::set whose @p OutDimsNum last dimensions should
606 ///                      belong to the current band node.
607 /// @param OutDimsNum    A number of dimensions that should belong to
608 ///                      the current band node.
609 static isl::union_set getIsolateOptions(isl::set IsolateDomain,
610                                         isl_size OutDimsNum) {
611   isl_size Dims = IsolateDomain.dim(isl::dim::set);
612   assert(OutDimsNum <= Dims &&
613          "The isl::set IsolateDomain is used to describe the range of schedule "
614          "dimensions values, which should be isolated. Consequently, the "
615          "number of its dimensions should be greater than or equal to the "
616          "number of the schedule dimensions.");
617   isl::map IsolateRelation = isl::map::from_domain(IsolateDomain);
618   IsolateRelation = IsolateRelation.move_dims(isl::dim::out, 0, isl::dim::in,
619                                               Dims - OutDimsNum, OutDimsNum);
620   isl::set IsolateOption = IsolateRelation.wrap();
621   isl::id Id = isl::id::alloc(IsolateOption.get_ctx(), "isolate", nullptr);
622   IsolateOption = IsolateOption.set_tuple_id(Id);
623   return isl::union_set(IsolateOption);
624 }
625 
626 /// Create an isl::union_set, which describes the specified option for the
627 /// dimension of the current node.
628 ///
629 /// @param Ctx    An isl::ctx, which is used to create the isl::union_set.
630 /// @param Option The name of the option.
631 isl::union_set getDimOptions(isl::ctx Ctx, const char *Option) {
632   isl::space Space(Ctx, 0, 1);
633   auto DimOption = isl::set::universe(Space);
634   auto Id = isl::id::alloc(Ctx, Option, nullptr);
635   DimOption = DimOption.set_tuple_id(Id);
636   return isl::union_set(DimOption);
637 }
638 
639 /// Create an isl::union_set, which describes the option of the form
640 /// [isolate[] -> unroll[x]].
641 ///
642 /// @param Ctx An isl::ctx, which is used to create the isl::union_set.
643 static isl::union_set getUnrollIsolatedSetOptions(isl::ctx Ctx) {
644   isl::space Space = isl::space(Ctx, 0, 0, 1);
645   isl::map UnrollIsolatedSetOption = isl::map::universe(Space);
646   isl::id DimInId = isl::id::alloc(Ctx, "isolate", nullptr);
647   isl::id DimOutId = isl::id::alloc(Ctx, "unroll", nullptr);
648   UnrollIsolatedSetOption =
649       UnrollIsolatedSetOption.set_tuple_id(isl::dim::in, DimInId);
650   UnrollIsolatedSetOption =
651       UnrollIsolatedSetOption.set_tuple_id(isl::dim::out, DimOutId);
652   return UnrollIsolatedSetOption.wrap();
653 }
654 
655 /// Make the last dimension of Set to take values from 0 to VectorWidth - 1.
656 ///
657 /// @param Set         A set, which should be modified.
658 /// @param VectorWidth A parameter, which determines the constraint.
659 static isl::set addExtentConstraints(isl::set Set, int VectorWidth) {
660   unsigned Dims = Set.dim(isl::dim::set);
661   isl::space Space = Set.get_space();
662   isl::local_space LocalSpace = isl::local_space(Space);
663   isl::constraint ExtConstr = isl::constraint::alloc_inequality(LocalSpace);
664   ExtConstr = ExtConstr.set_constant_si(0);
665   ExtConstr = ExtConstr.set_coefficient_si(isl::dim::set, Dims - 1, 1);
666   Set = Set.add_constraint(ExtConstr);
667   ExtConstr = isl::constraint::alloc_inequality(LocalSpace);
668   ExtConstr = ExtConstr.set_constant_si(VectorWidth - 1);
669   ExtConstr = ExtConstr.set_coefficient_si(isl::dim::set, Dims - 1, -1);
670   return Set.add_constraint(ExtConstr);
671 }
672 } // namespace
673 
674 isl::set polly::getPartialTilePrefixes(isl::set ScheduleRange,
675                                        int VectorWidth) {
676   isl_size Dims = ScheduleRange.dim(isl::dim::set);
677   isl::set LoopPrefixes =
678       ScheduleRange.drop_constraints_involving_dims(isl::dim::set, Dims - 1, 1);
679   auto ExtentPrefixes = addExtentConstraints(LoopPrefixes, VectorWidth);
680   isl::set BadPrefixes = ExtentPrefixes.subtract(ScheduleRange);
681   BadPrefixes = BadPrefixes.project_out(isl::dim::set, Dims - 1, 1);
682   LoopPrefixes = LoopPrefixes.project_out(isl::dim::set, Dims - 1, 1);
683   return LoopPrefixes.subtract(BadPrefixes);
684 }
685 
686 namespace {
687 isl::schedule_node
688 ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node,
689                                                int VectorWidth) {
690   assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
691   Node = Node.child(0).child(0);
692   isl::union_map SchedRelUMap = Node.get_prefix_schedule_relation();
693   isl::union_set ScheduleRangeUSet = SchedRelUMap.range();
694   isl::set ScheduleRange{ScheduleRangeUSet};
695   isl::set IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth);
696   auto AtomicOption = getDimOptions(IsolateDomain.get_ctx(), "atomic");
697   isl::union_set IsolateOption = getIsolateOptions(IsolateDomain, 1);
698   Node = Node.parent().parent();
699   isl::union_set Options = IsolateOption.unite(AtomicOption);
700   Node = Node.band_set_ast_build_options(Options);
701   return Node;
702 }
703 
isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand(
    isl::schedule_node Node, unsigned DimToVectorize, int VectorWidth) {
  assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);

  auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
  isl_size ScheduleDimensions = Space.dim(isl::dim::set);
  assert((isl_size)DimToVectorize < ScheduleDimensions);

  // Split the band so that the dimension to vectorize forms a band of its
  // own: first split off the dimensions before it ...
  if (DimToVectorize > 0) {
    Node = isl::manage(
        isl_schedule_node_band_split(Node.release(), DimToVectorize));
    Node = Node.child(0);
  }
  // ... then split off any dimensions that follow it.
  if ((isl_size)DimToVectorize < ScheduleDimensions - 1)
    Node = isl::manage(isl_schedule_node_band_split(Node.release(), 1));
  // Strip-mine the now one-dimensional band by VectorWidth.
  Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
  auto Sizes = isl::multi_val::zero(Space);
  Sizes = Sizes.set_val(0, isl::val(Node.get_ctx(), VectorWidth));
  Node =
      isl::manage(isl_schedule_node_band_tile(Node.release(), Sizes.release()));
  // Separate the full vector iterations from the remaining partial ones.
  Node = isolateFullPartialTiles(Node, VectorWidth);
  Node = Node.child(0);
  // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise,
  // we will have troubles to match it in the backend.
  Node = Node.band_set_ast_build_options(
      isl::union_set(Node.get_ctx(), "{ unroll[x]: 1 = 0 }"));
  // Sink the point band to the innermost position of the schedule tree.
  Node = isl::manage(isl_schedule_node_band_sink(Node.release()));
  Node = Node.child(0);
  if (isl_schedule_node_get_type(Node.get()) == isl_schedule_node_leaf)
    Node = Node.parent();
  // Mark the loop as "SIMD" so later phases can recognize it.
  auto LoopMarker = isl::id::alloc(Node.get_ctx(), "SIMD", nullptr);
  PrevectOpts++;
  return Node.insert_mark(LoopMarker);
}
738 
739 isl::schedule_node ScheduleTreeOptimizer::tileNode(isl::schedule_node Node,
740                                                    const char *Identifier,
741                                                    ArrayRef<int> TileSizes,
742                                                    int DefaultTileSize) {
743   auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
744   auto Dims = Space.dim(isl::dim::set);
745   auto Sizes = isl::multi_val::zero(Space);
746   std::string IdentifierString(Identifier);
747   for (auto i : seq<isl_size>(0, Dims)) {
748     auto tileSize =
749         i < (isl_size)TileSizes.size() ? TileSizes[i] : DefaultTileSize;
750     Sizes = Sizes.set_val(i, isl::val(Node.get_ctx(), tileSize));
751   }
752   auto TileLoopMarkerStr = IdentifierString + " - Tiles";
753   auto TileLoopMarker =
754       isl::id::alloc(Node.get_ctx(), TileLoopMarkerStr, nullptr);
755   Node = Node.insert_mark(TileLoopMarker);
756   Node = Node.child(0);
757   Node =
758       isl::manage(isl_schedule_node_band_tile(Node.release(), Sizes.release()));
759   Node = Node.child(0);
760   auto PointLoopMarkerStr = IdentifierString + " - Points";
761   auto PointLoopMarker =
762       isl::id::alloc(Node.get_ctx(), PointLoopMarkerStr, nullptr);
763   Node = Node.insert_mark(PointLoopMarker);
764   return Node.child(0);
765 }
766 
767 isl::schedule_node ScheduleTreeOptimizer::applyRegisterTiling(
768     isl::schedule_node Node, ArrayRef<int> TileSizes, int DefaultTileSize) {
769   Node = tileNode(Node, "Register tiling", TileSizes, DefaultTileSize);
770   auto Ctx = Node.get_ctx();
771   return Node.band_set_ast_build_options(isl::union_set(Ctx, "{unroll[x]}"));
772 }
773 
774 static bool isSimpleInnermostBand(const isl::schedule_node &Node) {
775   assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
776   assert(isl_schedule_node_n_children(Node.get()) == 1);
777 
778   auto ChildType = isl_schedule_node_get_type(Node.child(0).get());
779 
780   if (ChildType == isl_schedule_node_leaf)
781     return true;
782 
783   if (ChildType != isl_schedule_node_sequence)
784     return false;
785 
786   auto Sequence = Node.child(0);
787 
788   for (int c = 0, nc = isl_schedule_node_n_children(Sequence.get()); c < nc;
789        ++c) {
790     auto Child = Sequence.child(c);
791     if (isl_schedule_node_get_type(Child.get()) != isl_schedule_node_filter)
792       return false;
793     if (isl_schedule_node_get_type(Child.child(0).get()) !=
794         isl_schedule_node_leaf)
795       return false;
796   }
797   return true;
798 }
799 
800 bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) {
801   if (isl_schedule_node_get_type(Node.get()) != isl_schedule_node_band)
802     return false;
803 
804   if (isl_schedule_node_n_children(Node.get()) != 1)
805     return false;
806 
807   if (!isl_schedule_node_band_get_permutable(Node.get()))
808     return false;
809 
810   auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
811   auto Dims = Space.dim(isl::dim::set);
812 
813   if (Dims <= 1)
814     return false;
815 
816   return isSimpleInnermostBand(Node);
817 }
818 
__isl_give isl::schedule_node
ScheduleTreeOptimizer::standardBandOpts(isl::schedule_node Node, void *User) {
  // Apply the requested tilings from coarse to fine granularity. Each
  // tiling step returns the newly created point band, so successive
  // levels nest inside each other.
  if (FirstLevelTiling) {
    Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes,
                    FirstLevelDefaultTileSize);
    FirstLevelTileOpts++;
  }

  if (SecondLevelTiling) {
    Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes,
                    SecondLevelDefaultTileSize);
    SecondLevelTileOpts++;
  }

  if (RegisterTiling) {
    Node =
        applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize);
    RegisterTileOpts++;
  }

  // Without a vectorizer, prevectorization would only add overhead.
  if (PollyVectorizerChoice == VECTORIZER_NONE)
    return Node;

  auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
  auto Dims = Space.dim(isl::dim::set);

  // Strip-mine the innermost coincident (parallel) band dimension to
  // enable inner-loop vectorization.
  for (int i = Dims - 1; i >= 0; i--)
    if (Node.band_member_get_coincident(i)) {
      Node = prevectSchedBand(Node, i, PrevectorWidth);
      break;
    }

  return Node;
}
853 
854 /// Permute the two dimensions of the isl map.
855 ///
856 /// Permute @p DstPos and @p SrcPos dimensions of the isl map @p Map that
857 /// have type @p DimType.
858 ///
859 /// @param Map     The isl map to be modified.
860 /// @param DimType The type of the dimensions.
861 /// @param DstPos  The first dimension.
862 /// @param SrcPos  The second dimension.
863 /// @return        The modified map.
isl::map permuteDimensions(isl::map Map, isl::dim DimType, unsigned DstPos,
                           unsigned SrcPos) {
  assert((isl_size)DstPos < Map.dim(DimType) &&
         (isl_size)SrcPos < Map.dim(DimType));
  // Nothing to do if both positions coincide.
  if (DstPos == SrcPos)
    return Map;
  // Remember the tuple ids of both tuples: they can be discarded while
  // dimensions are moved between the tuples, so restore them at the end.
  isl::id DimId;
  if (Map.has_tuple_id(DimType))
    DimId = Map.get_tuple_id(DimType);
  auto FreeDim = DimType == isl::dim::in ? isl::dim::out : isl::dim::in;
  isl::id FreeDimId;
  if (Map.has_tuple_id(FreeDim))
    FreeDimId = Map.get_tuple_id(FreeDim);
  auto MaxDim = std::max(DstPos, SrcPos);
  auto MinDim = std::min(DstPos, SrcPos);
  // Swap the two dimensions by parking them at the front of the opposite
  // tuple. MaxDim is moved first so that MinDim's index stays valid; the
  // two moves back reinsert them in exchanged order.
  Map = Map.move_dims(FreeDim, 0, DimType, MaxDim, 1);
  Map = Map.move_dims(FreeDim, 0, DimType, MinDim, 1);
  Map = Map.move_dims(DimType, MinDim, FreeDim, 1, 1);
  Map = Map.move_dims(DimType, MaxDim, FreeDim, 0, 1);
  if (DimId)
    Map = Map.set_tuple_id(DimType, DimId);
  if (FreeDimId)
    Map = Map.set_tuple_id(FreeDim, FreeDimId);
  return Map;
}
889 
890 /// Check the form of the access relation.
891 ///
892 /// Check that the access relation @p AccMap has the form M[i][j], where i
893 /// is a @p FirstPos and j is a @p SecondPos.
894 ///
895 /// @param AccMap    The access relation to be checked.
896 /// @param FirstPos  The index of the input dimension that is mapped to
897 ///                  the first output dimension.
898 /// @param SecondPos The index of the input dimension that is mapped to the
899 ///                  second output dimension.
900 /// @return          True in case @p AccMap has the expected form and false,
901 ///                  otherwise.
902 static bool isMatMulOperandAcc(isl::set Domain, isl::map AccMap, int &FirstPos,
903                                int &SecondPos) {
904   isl::space Space = AccMap.get_space();
905   isl::map Universe = isl::map::universe(Space);
906 
907   if (Space.dim(isl::dim::out) != 2)
908     return false;
909 
910   // MatMul has the form:
911   // for (i = 0; i < N; i++)
912   //   for (j = 0; j < M; j++)
913   //     for (k = 0; k < P; k++)
914   //       C[i, j] += A[i, k] * B[k, j]
915   //
916   // Permutation of three outer loops: 3! = 6 possibilities.
917   int FirstDims[] = {0, 0, 1, 1, 2, 2};
918   int SecondDims[] = {1, 2, 2, 0, 0, 1};
919   for (int i = 0; i < 6; i += 1) {
920     auto PossibleMatMul =
921         Universe.equate(isl::dim::in, FirstDims[i], isl::dim::out, 0)
922             .equate(isl::dim::in, SecondDims[i], isl::dim::out, 1);
923 
924     AccMap = AccMap.intersect_domain(Domain);
925     PossibleMatMul = PossibleMatMul.intersect_domain(Domain);
926 
927     // If AccMap spans entire domain (Non-partial write),
928     // compute FirstPos and SecondPos.
929     // If AccMap != PossibleMatMul here (the two maps have been gisted at
930     // this point), it means that the writes are not complete, or in other
931     // words, it is a Partial write and Partial writes must be rejected.
932     if (AccMap.is_equal(PossibleMatMul)) {
933       if (FirstPos != -1 && FirstPos != FirstDims[i])
934         continue;
935       FirstPos = FirstDims[i];
936       if (SecondPos != -1 && SecondPos != SecondDims[i])
937         continue;
938       SecondPos = SecondDims[i];
939       return true;
940     }
941   }
942 
943   return false;
944 }
945 
946 /// Does the memory access represent a non-scalar operand of the matrix
947 /// multiplication.
948 ///
949 /// Check that the memory access @p MemAccess is the read access to a non-scalar
950 /// operand of the matrix multiplication or its result.
951 ///
952 /// @param MemAccess The memory access to be checked.
953 /// @param MMI       Parameters of the matrix multiplication operands.
954 /// @return          True in case the memory access represents the read access
955 ///                  to a non-scalar operand of the matrix multiplication and
956 ///                  false, otherwise.
static bool isMatMulNonScalarReadAccess(MemoryAccess *MemAccess,
                                        MatMulInfoTy &MMI) {
  // Only reads of real arrays can be matmul operands.
  if (!MemAccess->isLatestArrayKind() || !MemAccess->isRead())
    return false;
  auto AccMap = MemAccess->getLatestAccessRelation();
  isl::set StmtDomain = MemAccess->getStatement()->getDomain();
  // C[i, j]: read of the result matrix. Note that each successful check
  // also fixes the corresponding loop indexes in MMI (via the by-reference
  // parameters of isMatMulOperandAcc), constraining the later checks.
  if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.i, MMI.j) && !MMI.ReadFromC) {
    MMI.ReadFromC = MemAccess;
    return true;
  }
  // A[i, k]: first operand.
  if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.i, MMI.k) && !MMI.A) {
    MMI.A = MemAccess;
    return true;
  }
  // B[k, j]: second operand.
  if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.k, MMI.j) && !MMI.B) {
    MMI.B = MemAccess;
    return true;
  }
  return false;
}
977 
978 /// Check accesses to operands of the matrix multiplication.
979 ///
980 /// Check that accesses of the SCoP statement, which corresponds to
981 /// the partial schedule @p PartialSchedule, are scalar in terms of loops
982 /// containing the matrix multiplication, in case they do not represent
983 /// accesses to the non-scalar operands of the matrix multiplication or
984 /// its result.
985 ///
986 /// @param  PartialSchedule The partial schedule of the SCoP statement.
987 /// @param  MMI             Parameters of the matrix multiplication operands.
988 /// @return                 True in case the corresponding SCoP statement
989 ///                         represents matrix multiplication and false,
990 ///                         otherwise.
991 static bool containsOnlyMatrMultAcc(isl::map PartialSchedule,
992                                     MatMulInfoTy &MMI) {
993   auto InputDimId = PartialSchedule.get_tuple_id(isl::dim::in);
994   auto *Stmt = static_cast<ScopStmt *>(InputDimId.get_user());
995   isl_size OutDimNum = PartialSchedule.dim(isl::dim::out);
996   assert(OutDimNum > 2 && "In case of the matrix multiplication the loop nest "
997                           "and, consequently, the corresponding scheduling "
998                           "functions have at least three dimensions.");
999   auto MapI =
1000       permuteDimensions(PartialSchedule, isl::dim::out, MMI.i, OutDimNum - 1);
1001   auto MapJ =
1002       permuteDimensions(PartialSchedule, isl::dim::out, MMI.j, OutDimNum - 1);
1003   auto MapK =
1004       permuteDimensions(PartialSchedule, isl::dim::out, MMI.k, OutDimNum - 1);
1005 
1006   auto Accesses = getAccessesInOrder(*Stmt);
1007   for (auto *MemA = Accesses.begin(); MemA != Accesses.end() - 1; MemA++) {
1008     auto *MemAccessPtr = *MemA;
1009     if (MemAccessPtr->isLatestArrayKind() && MemAccessPtr != MMI.WriteToC &&
1010         !isMatMulNonScalarReadAccess(MemAccessPtr, MMI) &&
1011         !(MemAccessPtr->isStrideZero(MapI)) &&
1012         MemAccessPtr->isStrideZero(MapJ) && MemAccessPtr->isStrideZero(MapK))
1013       return false;
1014   }
1015   return true;
1016 }
1017 
1018 /// Check for dependencies corresponding to the matrix multiplication.
1019 ///
1020 /// Check that there is only true dependence of the form
1021 /// S(..., k, ...) -> S(..., k + 1, …), where S is the SCoP statement
1022 /// represented by @p Schedule and k is @p Pos. Such a dependence corresponds
1023 /// to the dependency produced by the matrix multiplication.
1024 ///
1025 /// @param  Schedule The schedule of the SCoP statement.
1026 /// @param  D The SCoP dependencies.
1027 /// @param  Pos The parameter to describe an acceptable true dependence.
1028 ///             In case it has a negative value, try to determine its
1029 ///             acceptable value.
1030 /// @return True in case dependencies correspond to the matrix multiplication
1031 ///         and false, otherwise.
static bool containsOnlyMatMulDep(isl::map Schedule, const Dependences *D,
                                  int &Pos) {
  // Collect the true (RAW) dependences and, if present, the reduction
  // dependences.
  isl::union_map Dep = D->getDependences(Dependences::TYPE_RAW);
  isl::union_map Red = D->getDependences(Dependences::TYPE_RED);
  if (Red)
    Dep = Dep.unite(Red);
  // Restrict to self-dependences of the statement and compute the
  // dependence distance vectors.
  auto DomainSpace = Schedule.get_space().domain();
  auto Space = DomainSpace.map_from_domain_and_range(DomainSpace);
  auto Deltas = Dep.extract_map(Space).deltas();
  isl_size DeltasDimNum = Deltas.dim(isl::dim::set);
  // Accept only distance vectors of the form (0, ..., 0, 1, 0, ..., 0)
  // with the single 1 at position Pos. A negative Pos is resolved to the
  // first dimension with distance one.
  for (int i = 0; i < DeltasDimNum; i++) {
    auto Val = Deltas.plain_get_val_if_fixed(isl::dim::set, i);
    Pos = Pos < 0 && Val.is_one() ? i : Pos;
    if (Val.is_nan() || !(Val.is_zero() || (i == Pos && Val.is_one())))
      return false;
  }
  // There must be at least one dimension and exactly one carried
  // dependence for this to look like the matmul reduction over k.
  if (DeltasDimNum == 0 || Pos < 0)
    return false;
  return true;
}
1052 
1053 /// Check if the SCoP statement could probably be optimized with analytical
1054 /// modeling.
1055 ///
1056 /// containsMatrMult tries to determine whether the following conditions
1057 /// are true:
1058 /// 1. The last memory access modeling an array, MA1, represents writing to
1059 ///    memory and has the form S(..., i1, ..., i2, ...) -> M(i1, i2) or
1060 ///    S(..., i2, ..., i1, ...) -> M(i1, i2), where S is the SCoP statement
1061 ///    under consideration.
1062 /// 2. There is only one loop-carried true dependency, and it has the
1063 ///    form S(..., i3, ...) -> S(..., i3 + 1, ...), and there are no
1064 ///    loop-carried or anti dependencies.
1065 /// 3. SCoP contains three access relations, MA2, MA3, and MA4 that represent
1066 ///    reading from memory and have the form S(..., i3, ...) -> M(i1, i3),
1067 ///    S(..., i3, ...) -> M(i3, i2), S(...) -> M(i1, i2), respectively,
1068 ///    and all memory accesses of the SCoP that are different from MA1, MA2,
1069 ///    MA3, and MA4 have stride 0, if the innermost loop is exchanged with any
1070 ///    of loops i1, i2 and i3.
1071 ///
1072 /// @param PartialSchedule The PartialSchedule that contains a SCoP statement
1073 ///        to check.
/// @param D   The SCoP dependencies.
/// @param MMI Parameters of the matrix multiplication operands.
static bool containsMatrMult(isl::map PartialSchedule, const Dependences *D,
                             MatMulInfoTy &MMI) {
  auto InputDimsId = PartialSchedule.get_tuple_id(isl::dim::in);
  auto *Stmt = static_cast<ScopStmt *>(InputDimsId.get_user());
  // A matrix multiplication needs more than one memory access.
  if (Stmt->size() <= 1)
    return false;

  // Condition 1 (see the function-level comment): walk the accesses
  // backwards; the last array access must be the write of the result
  // C[i, j]. This also fixes MMI.i and MMI.j.
  auto Accesses = getAccessesInOrder(*Stmt);
  for (auto *MemA = Accesses.end() - 1; MemA != Accesses.begin(); MemA--) {
    auto *MemAccessPtr = *MemA;
    if (!MemAccessPtr->isLatestArrayKind())
      continue;
    if (!MemAccessPtr->isWrite())
      return false;
    auto AccMap = MemAccessPtr->getLatestAccessRelation();
    if (!isMatMulOperandAcc(Stmt->getDomain(), AccMap, MMI.i, MMI.j))
      return false;
    MMI.WriteToC = MemAccessPtr;
    break;
  }

  // Condition 2: a single loop-carried dependence, along the k loop.
  if (!containsOnlyMatMulDep(PartialSchedule, D, MMI.k))
    return false;

  // Condition 3: all remaining accesses are the operand reads or are
  // scalar with respect to the matmul loops.
  if (!MMI.WriteToC || !containsOnlyMatrMultAcc(PartialSchedule, MMI))
    return false;

  // All three non-scalar reads (A, B and C) must have been identified.
  if (!MMI.A || !MMI.B || !MMI.ReadFromC)
    return false;
  return true;
}
1107 
1108 /// Permute two dimensions of the band node.
1109 ///
1110 /// Permute FirstDim and SecondDim dimensions of the Node.
1111 ///
1112 /// @param Node The band node to be modified.
1113 /// @param FirstDim The first dimension to be permuted.
1114 /// @param SecondDim The second dimension to be permuted.
1115 static isl::schedule_node permuteBandNodeDimensions(isl::schedule_node Node,
1116                                                     unsigned FirstDim,
1117                                                     unsigned SecondDim) {
1118   assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band &&
1119          (unsigned)isl_schedule_node_band_n_member(Node.get()) >
1120              std::max(FirstDim, SecondDim));
1121   auto PartialSchedule =
1122       isl::manage(isl_schedule_node_band_get_partial_schedule(Node.get()));
1123   auto PartialScheduleFirstDim = PartialSchedule.get_union_pw_aff(FirstDim);
1124   auto PartialScheduleSecondDim = PartialSchedule.get_union_pw_aff(SecondDim);
1125   PartialSchedule =
1126       PartialSchedule.set_union_pw_aff(SecondDim, PartialScheduleFirstDim);
1127   PartialSchedule =
1128       PartialSchedule.set_union_pw_aff(FirstDim, PartialScheduleSecondDim);
1129   Node = isl::manage(isl_schedule_node_delete(Node.release()));
1130   return Node.insert_partial_schedule(PartialSchedule);
1131 }
1132 
1133 isl::schedule_node ScheduleTreeOptimizer::createMicroKernel(
1134     isl::schedule_node Node, MicroKernelParamsTy MicroKernelParams) {
1135   Node = applyRegisterTiling(Node, {MicroKernelParams.Mr, MicroKernelParams.Nr},
1136                              1);
1137   Node = Node.parent().parent();
1138   return permuteBandNodeDimensions(Node, 0, 1).child(0).child(0);
1139 }
1140 
isl::schedule_node ScheduleTreeOptimizer::createMacroKernel(
    isl::schedule_node Node, MacroKernelParamsTy MacroKernelParams) {
  assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
  // All-ones parameters mean no macro-kernel blocking was computed.
  if (MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 1 &&
      MacroKernelParams.Kc == 1)
    return Node;
  // Tile only the last three band dimensions, by Mc, Nc and Kc.
  int DimOutNum = isl_schedule_node_band_n_member(Node.get());
  std::vector<int> TileSizes(DimOutNum, 1);
  TileSizes[DimOutNum - 3] = MacroKernelParams.Mc;
  TileSizes[DimOutNum - 2] = MacroKernelParams.Nc;
  TileSizes[DimOutNum - 1] = MacroKernelParams.Kc;
  Node = tileNode(Node, "1st level tiling", TileSizes, 1);
  Node = Node.parent().parent();
  // Two successive swaps reorder the last three tile dimensions from
  // (i, j, k) to (j, k, i).
  Node = permuteBandNodeDimensions(Node, DimOutNum - 2, DimOutNum - 1);
  Node = permuteBandNodeDimensions(Node, DimOutNum - 3, DimOutNum - 1);

  // Mark the outermost loop as parallelizable.
  Node = Node.band_member_set_coincident(0, true);

  return Node.child(0).child(0);
}
1162 
1163 /// Get the size of the widest type of the matrix multiplication operands
1164 /// in bytes, including alignment padding.
1165 ///
1166 /// @param MMI Parameters of the matrix multiplication operands.
1167 /// @return The size of the widest type of the matrix multiplication operands
1168 ///         in bytes, including alignment padding.
1169 static uint64_t getMatMulAlignTypeSize(MatMulInfoTy MMI) {
1170   auto *S = MMI.A->getStatement()->getParent();
1171   auto &DL = S->getFunction().getParent()->getDataLayout();
1172   auto ElementSizeA = DL.getTypeAllocSize(MMI.A->getElementType());
1173   auto ElementSizeB = DL.getTypeAllocSize(MMI.B->getElementType());
1174   auto ElementSizeC = DL.getTypeAllocSize(MMI.WriteToC->getElementType());
1175   return std::max({ElementSizeA, ElementSizeB, ElementSizeC});
1176 }
1177 
1178 /// Get the size of the widest type of the matrix multiplication operands
1179 /// in bits.
1180 ///
1181 /// @param MMI Parameters of the matrix multiplication operands.
1182 /// @return The size of the widest type of the matrix multiplication operands
1183 ///         in bits.
1184 static uint64_t getMatMulTypeSize(MatMulInfoTy MMI) {
1185   auto *S = MMI.A->getStatement()->getParent();
1186   auto &DL = S->getFunction().getParent()->getDataLayout();
1187   auto ElementSizeA = DL.getTypeSizeInBits(MMI.A->getElementType());
1188   auto ElementSizeB = DL.getTypeSizeInBits(MMI.B->getElementType());
1189   auto ElementSizeC = DL.getTypeSizeInBits(MMI.WriteToC->getElementType());
1190   return std::max({ElementSizeA, ElementSizeB, ElementSizeC});
1191 }
1192 
1193 /// Get parameters of the BLIS micro kernel.
1194 ///
1195 /// We choose the Mr and Nr parameters of the micro kernel to be large enough
1196 /// such that no stalls caused by the combination of latencies and dependencies
1197 /// are introduced during the updates of the resulting matrix of the matrix
1198 /// multiplication. However, they should also be as small as possible to
1199 /// release more registers for entries of multiplied matrices.
1200 ///
1201 /// @param TTI Target Transform Info.
1202 /// @param MMI Parameters of the matrix multiplication operands.
1203 /// @return The structure of type MicroKernelParamsTy.
1204 /// @see MicroKernelParamsTy
static struct MicroKernelParamsTy
getMicroKernelParams(const TargetTransformInfo *TTI, MatMulInfoTy MMI) {
  assert(TTI && "The target transform info should be provided.");

  // Nvec - Number of double-precision floating-point numbers that can be hold
  // by a vector register. Use 2 by default.
  long RegisterBitwidth = VectorRegisterBitwidth;

  // A width of -1 means "not set on the command line": ask the target.
  if (RegisterBitwidth == -1)
    RegisterBitwidth = TTI->getRegisterBitWidth(true);
  auto ElementSize = getMatMulTypeSize(MMI);
  assert(ElementSize > 0 && "The element size of the matrix multiplication "
                            "operands should be greater than zero.");
  auto Nvec = RegisterBitwidth / ElementSize;
  if (Nvec == 0)
    Nvec = 2;
  // Nr: sqrt(Nvec * LatencyVectorFma * ThroughputVectorFma) rounded up to
  // the next multiple of Nvec; Mr covers the remaining portion of the
  // latency-throughput product.
  int Nr = ceil(sqrt((double)(Nvec * LatencyVectorFma * ThroughputVectorFma)) /
                Nvec) *
           Nvec;
  int Mr = ceil((double)(Nvec * LatencyVectorFma * ThroughputVectorFma / Nr));
  return {Mr, Nr};
}
1227 
1228 /// Determine parameters of the target cache.
1229 ///
1230 /// @param TTI Target Transform Info.
void getTargetCacheParameters(const llvm::TargetTransformInfo *TTI) {
  auto L1DCache = llvm::TargetTransformInfo::CacheLevel::L1D;
  auto L2DCache = llvm::TargetTransformInfo::CacheLevel::L2D;
  // For each parameter, -1 means "not set on the command line": query the
  // target and fall back to the built-in default if the target does not
  // report a value.
  if (FirstCacheLevelSize == -1) {
    if (TTI->getCacheSize(L1DCache).hasValue())
      FirstCacheLevelSize = TTI->getCacheSize(L1DCache).getValue();
    else
      FirstCacheLevelSize = static_cast<int>(FirstCacheLevelDefaultSize);
  }
  if (SecondCacheLevelSize == -1) {
    if (TTI->getCacheSize(L2DCache).hasValue())
      SecondCacheLevelSize = TTI->getCacheSize(L2DCache).getValue();
    else
      SecondCacheLevelSize = static_cast<int>(SecondCacheLevelDefaultSize);
  }
  if (FirstCacheLevelAssociativity == -1) {
    if (TTI->getCacheAssociativity(L1DCache).hasValue())
      FirstCacheLevelAssociativity =
          TTI->getCacheAssociativity(L1DCache).getValue();
    else
      FirstCacheLevelAssociativity =
          static_cast<int>(FirstCacheLevelDefaultAssociativity);
  }
  if (SecondCacheLevelAssociativity == -1) {
    if (TTI->getCacheAssociativity(L2DCache).hasValue())
      SecondCacheLevelAssociativity =
          TTI->getCacheAssociativity(L2DCache).getValue();
    else
      SecondCacheLevelAssociativity =
          static_cast<int>(SecondCacheLevelDefaultAssociativity);
  }
}
1263 
1264 /// Get parameters of the BLIS macro kernel.
1265 ///
1266 /// During the computation of matrix multiplication, blocks of partitioned
1267 /// matrices are mapped to different layers of the memory hierarchy.
1268 /// To optimize data reuse, blocks should be ideally kept in cache between
1269 /// iterations. Since parameters of the macro kernel determine sizes of these
1270 /// blocks, there are upper and lower bounds on these parameters.
1271 ///
1272 /// @param TTI Target Transform Info.
1273 /// @param MicroKernelParams Parameters of the micro-kernel
1274 ///                          to be taken into account.
1275 /// @param MMI Parameters of the matrix multiplication operands.
1276 /// @return The structure of type MacroKernelParamsTy.
1277 /// @see MacroKernelParamsTy
1278 /// @see MicroKernelParamsTy
static struct MacroKernelParamsTy
getMacroKernelParams(const llvm::TargetTransformInfo *TTI,
                     const MicroKernelParamsTy &MicroKernelParams,
                     MatMulInfoTy MMI) {
  getTargetCacheParameters(TTI);
  // According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf,
  // it requires information about the first two levels of a cache to determine
  // all the parameters of a macro-kernel. It also checks that an associativity
  // degree of a cache level is greater than two. Otherwise, another algorithm
  // for determination of the parameters should be used.
  if (!(MicroKernelParams.Mr > 0 && MicroKernelParams.Nr > 0 &&
        FirstCacheLevelSize > 0 && SecondCacheLevelSize > 0 &&
        FirstCacheLevelAssociativity > 2 && SecondCacheLevelAssociativity > 2))
    return {1, 1, 1};
  // The quotient should be greater than zero.
  if (PollyPatternMatchingNcQuotient <= 0)
    return {1, 1, 1};
  // Car: the number of L1 cache ways granted to a block of A (see the
  // paper cited above for the derivation).
  int Car = floor(
      (FirstCacheLevelAssociativity - 1) /
      (1 + static_cast<double>(MicroKernelParams.Nr) / MicroKernelParams.Mr));

  // Car can be computed to be zero since it is floor to int.
  // On Mac OS, division by 0 does not raise a signal. This causes negative
  // tile sizes to be computed. Prevent division by Cac==0 by early returning
  // if this happens.
  if (Car == 0)
    return {1, 1, 1};

  auto ElementSize = getMatMulAlignTypeSize(MMI);
  assert(ElementSize > 0 && "The element size of the matrix multiplication "
                            "operands should be greater than zero.");
  // Kc follows from the L1 capacity granted to A, Mc from the L2 capacity,
  // and Nc is a user-controlled multiple of Nr.
  int Kc = (Car * FirstCacheLevelSize) /
           (MicroKernelParams.Mr * FirstCacheLevelAssociativity * ElementSize);
  double Cac =
      static_cast<double>(Kc * ElementSize * SecondCacheLevelAssociativity) /
      SecondCacheLevelSize;
  int Mc = floor((SecondCacheLevelAssociativity - 2) / Cac);
  int Nc = PollyPatternMatchingNcQuotient * MicroKernelParams.Nr;

  assert(Mc > 0 && Nc > 0 && Kc > 0 &&
         "Matrix block sizes should be  greater than zero");
  return {Mc, Nc, Kc};
}
1322 
1323 /// Create an access relation that is specific to
1324 ///        the matrix multiplication pattern.
1325 ///
1326 /// Create an access relation of the following form:
1327 /// [O0, O1, O2, O3, O4, O5, O6, O7, O8] -> [OI, O5, OJ]
1328 /// where I is @p FirstDim, J is @p SecondDim.
1329 ///
1330 /// It can be used, for example, to create relations that helps to consequently
1331 /// access elements of operands of a matrix multiplication after creation of
1332 /// the BLIS micro and macro kernels.
1333 ///
1334 /// @see ScheduleTreeOptimizer::createMicroKernel
1335 /// @see ScheduleTreeOptimizer::createMacroKernel
1336 ///
1337 /// Subsequently, the described access relation is applied to the range of
1338 /// @p MapOldIndVar, that is used to map original induction variables to
1339 /// the ones, which are produced by schedule transformations. It helps to
1340 /// define relations using a new space and, at the same time, keep them
1341 /// in the original one.
1342 ///
1343 /// @param MapOldIndVar The relation, which maps original induction variables
1344 ///                     to the ones, which are produced by schedule
1345 ///                     transformations.
1346 /// @param FirstDim, SecondDim The input dimensions that are used to define
1347 ///        the specified access relation.
1348 /// @return The specified access relation.
1349 isl::map getMatMulAccRel(isl::map MapOldIndVar, unsigned FirstDim,
1350                          unsigned SecondDim) {
1351   auto AccessRelSpace = isl::space(MapOldIndVar.get_ctx(), 0, 9, 3);
1352   auto AccessRel = isl::map::universe(AccessRelSpace);
1353   AccessRel = AccessRel.equate(isl::dim::in, FirstDim, isl::dim::out, 0);
1354   AccessRel = AccessRel.equate(isl::dim::in, 5, isl::dim::out, 1);
1355   AccessRel = AccessRel.equate(isl::dim::in, SecondDim, isl::dim::out, 2);
1356   return MapOldIndVar.apply_range(AccessRel);
1357 }
1358 
1359 isl::schedule_node createExtensionNode(isl::schedule_node Node,
1360                                        isl::map ExtensionMap) {
1361   auto Extension = isl::union_map(ExtensionMap);
1362   auto NewNode = isl::schedule_node::from_extension(Extension);
1363   return Node.graft_before(NewNode);
1364 }
1365 
1366 /// Apply the packing transformation.
1367 ///
1368 /// The packing transformation can be described as a data-layout
1369 /// transformation that requires to introduce a new array, copy data
1370 /// to the array, and change memory access locations to reference the array.
/// It can be used to ensure that elements of the new array are read with
/// in-stride accesses, aligned to cache line boundaries, and preloaded into
/// certain cache levels.
1374 ///
1375 /// As an example let us consider the packing of the array A that would help
1376 /// to read its elements with in-stride access. An access to the array A
1377 /// is represented by an access relation that has the form
1378 /// S[i, j, k] -> A[i, k]. The scheduling function of the SCoP statement S has
1379 /// the form S[i,j, k] -> [floor((j mod Nc) / Nr), floor((i mod Mc) / Mr),
1380 /// k mod Kc, j mod Nr, i mod Mr].
1381 ///
/// To ensure that elements of the array A are read with in-stride accesses,
/// we add a new array Packed_A[Mc/Mr][Kc][Mr] to the SCoP, using
1384 /// Scop::createScopArrayInfo, change the access relation
1385 /// S[i, j, k] -> A[i, k] to
1386 /// S[i, j, k] -> Packed_A[floor((i mod Mc) / Mr), k mod Kc, i mod Mr], using
1387 /// MemoryAccess::setNewAccessRelation, and copy the data to the array, using
1388 /// the copy statement created by Scop::addScopStmt.
1389 ///
1390 /// @param Node The schedule node to be optimized.
1391 /// @param MapOldIndVar The relation, which maps original induction variables
1392 ///                     to the ones, which are produced by schedule
1393 ///                     transformations.
1394 /// @param MicroParams, MacroParams Parameters of the BLIS kernel
1395 ///                                 to be taken into account.
1396 /// @param MMI Parameters of the matrix multiplication operands.
1397 /// @return The optimized schedule node.
static isl::schedule_node
optimizeDataLayoutMatrMulPattern(isl::schedule_node Node, isl::map MapOldIndVar,
                                 MicroKernelParamsTy MicroParams,
                                 MacroKernelParamsTy MacroParams,
                                 MatMulInfoTy &MMI) {
  // The user pointer of the input tuple id carries the SCoP statement the
  // schedule dimensions belong to.
  auto InputDimsId = MapOldIndVar.get_tuple_id(isl::dim::in);
  auto *Stmt = static_cast<ScopStmt *>(InputDimsId.get_user());

  // Create a copy statement that corresponds to the memory access to the
  // matrix B, the second operand of the matrix multiplication.
  // Walk up six levels from the current position and split off the two
  // outermost band members, so the extension nodes for the copy statements
  // can be grafted at the right depth.
  Node = Node.parent().parent().parent().parent().parent().parent();
  Node = isl::manage(isl_schedule_node_band_split(Node.release(), 2)).child(0);
  auto AccRel = getMatMulAccRel(MapOldIndVar, 3, 7);
  // Packed_B has the shape [Nc/Nr][Kc][Nr].
  unsigned FirstDimSize = MacroParams.Nc / MicroParams.Nr;
  unsigned SecondDimSize = MacroParams.Kc;
  unsigned ThirdDimSize = MicroParams.Nr;
  auto *SAI = Stmt->getParent()->createScopArrayInfo(
      MMI.B->getElementType(), "Packed_B",
      {FirstDimSize, SecondDimSize, ThirdDimSize});
  AccRel = AccRel.set_tuple_id(isl::dim::out, SAI->getBasePtrId());
  auto OldAcc = MMI.B->getLatestAccessRelation();
  MMI.B->setNewAccessRelation(AccRel);
  // Keep only the two outermost schedule dimensions and invert the map, so
  // it maps schedule prefixes back to original iterations; fixing dimension
  // MMI.i to zero restricts the copy-in of B to a single i iteration.
  auto ExtMap = MapOldIndVar.project_out(isl::dim::out, 2,
                                         MapOldIndVar.dim(isl::dim::out) - 2);
  ExtMap = ExtMap.reverse();
  ExtMap = ExtMap.fix_si(isl::dim::out, MMI.i, 0);
  auto Domain = Stmt->getDomain();

  // Restrict the domains of the copy statements to only execute when also its
  // originating statement is executed.
  auto DomainId = Domain.get_tuple_id();
  auto *NewStmt = Stmt->getParent()->addScopStmt(
      OldAcc, MMI.B->getLatestAccessRelation(), Domain);
  ExtMap = ExtMap.set_tuple_id(isl::dim::out, DomainId);
  ExtMap = ExtMap.intersect_range(Domain);
  ExtMap = ExtMap.set_tuple_id(isl::dim::out, NewStmt->getDomainId());
  Node = createExtensionNode(Node, ExtMap);

  // Create a copy statement that corresponds to the memory access
  // to the matrix A, the first operand of the matrix multiplication.
  Node = Node.child(0);
  AccRel = getMatMulAccRel(MapOldIndVar, 4, 6);
  // Packed_A has the shape [Mc/Mr][Kc][Mr]; SecondDimSize (Kc) is reused.
  FirstDimSize = MacroParams.Mc / MicroParams.Mr;
  ThirdDimSize = MicroParams.Mr;
  SAI = Stmt->getParent()->createScopArrayInfo(
      MMI.A->getElementType(), "Packed_A",
      {FirstDimSize, SecondDimSize, ThirdDimSize});
  AccRel = AccRel.set_tuple_id(isl::dim::out, SAI->getBasePtrId());
  OldAcc = MMI.A->getLatestAccessRelation();
  MMI.A->setNewAccessRelation(AccRel);
  // Analogous to B, but keeping three schedule dimensions and fixing
  // dimension MMI.j to zero for the copy-in of A.
  ExtMap = MapOldIndVar.project_out(isl::dim::out, 3,
                                    MapOldIndVar.dim(isl::dim::out) - 3);
  ExtMap = ExtMap.reverse();
  ExtMap = ExtMap.fix_si(isl::dim::out, MMI.j, 0);
  NewStmt = Stmt->getParent()->addScopStmt(
      OldAcc, MMI.A->getLatestAccessRelation(), Domain);

  // Restrict the domains of the copy statements to only execute when also its
  // originating statement is executed.
  ExtMap = ExtMap.set_tuple_id(isl::dim::out, DomainId);
  ExtMap = ExtMap.intersect_range(Domain);
  ExtMap = ExtMap.set_tuple_id(isl::dim::out, NewStmt->getDomainId());
  Node = createExtensionNode(Node, ExtMap);
  // Descend back to the position corresponding to the innermost band.
  return Node.child(0).child(0).child(0).child(0).child(0);
}
1463 
1464 /// Get a relation mapping induction variables produced by schedule
1465 /// transformations to the original ones.
1466 ///
1467 /// @param Node The schedule node produced as the result of creation
1468 ///        of the BLIS kernels.
1469 /// @param MicroKernelParams, MacroKernelParams Parameters of the BLIS kernel
1470 ///                                             to be taken into account.
1471 /// @return  The relation mapping original induction variables to the ones
1472 ///          produced by schedule transformation.
1473 /// @see ScheduleTreeOptimizer::createMicroKernel
1474 /// @see ScheduleTreeOptimizer::createMacroKernel
1475 /// @see getMacroKernelParams
1476 isl::map
1477 getInductionVariablesSubstitution(isl::schedule_node Node,
1478                                   MicroKernelParamsTy MicroKernelParams,
1479                                   MacroKernelParamsTy MacroKernelParams) {
1480   auto Child = Node.child(0);
1481   auto UnMapOldIndVar = Child.get_prefix_schedule_union_map();
1482   auto MapOldIndVar = isl::map::from_union_map(UnMapOldIndVar);
1483   if (MapOldIndVar.dim(isl::dim::out) > 9)
1484     return MapOldIndVar.project_out(isl::dim::out, 0,
1485                                     MapOldIndVar.dim(isl::dim::out) - 9);
1486   return MapOldIndVar;
1487 }
1488 
1489 /// Isolate a set of partial tile prefixes and unroll the isolated part.
1490 ///
1491 /// The set should ensure that it contains only partial tile prefixes that have
1492 /// exactly Mr x Nr iterations of the two innermost loops produced by
1493 /// the optimization of the matrix multiplication. Mr and Nr are parameters of
1494 /// the micro-kernel.
1495 ///
1496 /// In case of parametric bounds, this helps to auto-vectorize the unrolled
1497 /// innermost loops, using the SLP vectorizer.
1498 ///
1499 /// @param Node              The schedule node to be modified.
1500 /// @param MicroKernelParams Parameters of the micro-kernel
1501 ///                          to be taken into account.
1502 /// @return The modified isl_schedule_node.
static isl::schedule_node
isolateAndUnrollMatMulInnerLoops(isl::schedule_node Node,
                                 struct MicroKernelParamsTy MicroKernelParams) {
  isl::schedule_node Child = Node.get_child(0);
  isl::union_map UnMapOldIndVar = Child.get_prefix_schedule_relation();
  isl::set Prefix = isl::map::from_union_map(UnMapOldIndVar).range();
  isl_size Dims = Prefix.dim(isl::dim::set);
  // Drop the innermost dimension and compute the set of prefixes that have
  // full Nr x Mr tiles of the two innermost loops.
  Prefix = Prefix.project_out(isl::dim::set, Dims - 1, 1);
  Prefix = getPartialTilePrefixes(Prefix, MicroKernelParams.Nr);
  Prefix = getPartialTilePrefixes(Prefix, MicroKernelParams.Mr);

  // On the current band: isolate the computed prefixes, request unrolling of
  // all dimensions and unrolling of the isolated part.
  isl::union_set IsolateOption =
      getIsolateOptions(Prefix.add_dims(isl::dim::set, 3), 3);
  isl::ctx Ctx = Node.get_ctx();
  auto Options = IsolateOption.unite(getDimOptions(Ctx, "unroll"));
  Options = Options.unite(getUnrollIsolatedSetOptions(Ctx));
  Node = Node.band_set_ast_build_options(Options);
  // On the band three levels up: isolate the same prefixes and request the
  // "separate" AST generation option, then return to the original position.
  Node = Node.parent().parent().parent();
  IsolateOption = getIsolateOptions(Prefix, 3);
  Options = IsolateOption.unite(getDimOptions(Ctx, "separate"));
  Node = Node.band_set_ast_build_options(Options);
  Node = Node.child(0).child(0).child(0);
  return Node;
}
1527 
1528 /// Mark @p BasePtr with "Inter iteration alias-free" mark node.
1529 ///
1530 /// @param Node The child of the mark node to be inserted.
1531 /// @param BasePtr The pointer to be marked.
1532 /// @return The modified isl_schedule_node.
1533 static isl::schedule_node markInterIterationAliasFree(isl::schedule_node Node,
1534                                                       Value *BasePtr) {
1535   if (!BasePtr)
1536     return Node;
1537 
1538   auto Id =
1539       isl::id::alloc(Node.get_ctx(), "Inter iteration alias-free", BasePtr);
1540   return Node.insert_mark(Id).child(0);
1541 }
1542 
1543 /// Insert "Loop Vectorizer Disabled" mark node.
1544 ///
1545 /// @param Node The child of the mark node to be inserted.
1546 /// @return The modified isl_schedule_node.
1547 static isl::schedule_node markLoopVectorizerDisabled(isl::schedule_node Node) {
1548   auto Id = isl::id::alloc(Node.get_ctx(), "Loop Vectorizer Disabled", nullptr);
1549   return Node.insert_mark(Id).child(0);
1550 }
1551 
1552 /// Restore the initial ordering of dimensions of the band node
1553 ///
1554 /// In case the band node represents all the dimensions of the iteration
1555 /// domain, recreate the band node to restore the initial ordering of the
1556 /// dimensions.
1557 ///
1558 /// @param Node The band node to be modified.
1559 /// @return The modified schedule node.
static isl::schedule_node
getBandNodeWithOriginDimOrder(isl::schedule_node Node) {
  assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
  // Only handle a band that directly wraps a leaf, i.e. there are no nested
  // bands below.
  if (isl_schedule_node_get_type(Node.child(0).get()) != isl_schedule_node_leaf)
    return Node;
  auto Domain = Node.get_universe_domain();
  assert(isl_union_set_n_set(Domain.get()) == 1);
  // Bail out unless the band is outermost and covers every dimension of the
  // (single) iteration domain.
  if (Node.get_schedule_depth() != 0 ||
      (isl::set(Domain).dim(isl::dim::set) !=
       isl_schedule_node_band_n_member(Node.get())))
    return Node;
  // Delete the band and insert an identity partial schedule over the domain,
  // which restores the original ordering of the dimensions.
  Node = isl::manage(isl_schedule_node_delete(Node.copy()));
  auto PartialSchedulePwAff = Domain.identity_union_pw_multi_aff();
  auto PartialScheduleMultiPwAff =
      isl::multi_union_pw_aff(PartialSchedulePwAff);
  // Drop the tuple id so the multi union pw aff is usable as a schedule.
  PartialScheduleMultiPwAff =
      PartialScheduleMultiPwAff.reset_tuple_id(isl::dim::set);
  return Node.insert_partial_schedule(PartialScheduleMultiPwAff);
}
1579 
isl::schedule_node
ScheduleTreeOptimizer::optimizeMatMulPattern(isl::schedule_node Node,
                                             const TargetTransformInfo *TTI,
                                             MatMulInfoTy &MMI) {
  assert(TTI && "The target transform info should be provided.");
  Node = markInterIterationAliasFree(
      Node, MMI.WriteToC->getLatestScopArrayInfo()->getBasePtr());
  int DimOutNum = isl_schedule_node_band_n_member(Node.get());
  assert(DimOutNum > 2 && "In case of the matrix multiplication the loop nest "
                          "and, consequently, the corresponding scheduling "
                          "functions have at least three dimensions.");
  Node = getBandNodeWithOriginDimOrder(Node);
  // Permute the band so that i, j and k become the three innermost
  // dimensions (in this order). After each permutation, update the indices
  // that pointed to a dimension that was just moved.
  Node = permuteBandNodeDimensions(Node, MMI.i, DimOutNum - 3);
  int NewJ = MMI.j == DimOutNum - 3 ? MMI.i : MMI.j;
  int NewK = MMI.k == DimOutNum - 3 ? MMI.i : MMI.k;
  Node = permuteBandNodeDimensions(Node, NewJ, DimOutNum - 2);
  NewK = NewK == DimOutNum - 2 ? NewJ : NewK;
  Node = permuteBandNodeDimensions(Node, NewK, DimOutNum - 1);
  auto MicroKernelParams = getMicroKernelParams(TTI, MMI);
  auto MacroKernelParams = getMacroKernelParams(TTI, MicroKernelParams, MMI);
  Node = createMacroKernel(Node, MacroKernelParams);
  Node = createMicroKernel(Node, MicroKernelParams);
  // If any macro-kernel dimension degenerates to one, skip the data-layout
  // transformation and the unrolling of the inner loops.
  if (MacroKernelParams.Mc == 1 || MacroKernelParams.Nc == 1 ||
      MacroKernelParams.Kc == 1)
    return Node;
  auto MapOldIndVar = getInductionVariablesSubstitution(Node, MicroKernelParams,
                                                        MacroKernelParams);
  if (!MapOldIndVar)
    return Node;
  // Disable the LLVM loop vectorizer for the kernel; the innermost loops
  // are unrolled and isolated instead.
  Node = markLoopVectorizerDisabled(Node.parent()).child(0);
  Node = isolateAndUnrollMatMulInnerLoops(Node, MicroKernelParams);
  return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams,
                                          MacroKernelParams, MMI);
}
1614 
1615 bool ScheduleTreeOptimizer::isMatrMultPattern(isl::schedule_node Node,
1616                                               const Dependences *D,
1617                                               MatMulInfoTy &MMI) {
1618   auto PartialSchedule = isl::manage(
1619       isl_schedule_node_band_get_partial_schedule_union_map(Node.get()));
1620   Node = Node.child(0);
1621   auto LeafType = isl_schedule_node_get_type(Node.get());
1622   Node = Node.parent();
1623   if (LeafType != isl_schedule_node_leaf ||
1624       isl_schedule_node_band_n_member(Node.get()) < 3 ||
1625       Node.get_schedule_depth() != 0 ||
1626       isl_union_map_n_map(PartialSchedule.get()) != 1)
1627     return false;
1628   auto NewPartialSchedule = isl::map::from_union_map(PartialSchedule);
1629   if (containsMatrMult(NewPartialSchedule, D, MMI))
1630     return true;
1631   return false;
1632 }
1633 
__isl_give isl_schedule_node *
ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node,
                                    void *User) {
  // manage_copy borrows Node without taking ownership, so Node can be
  // returned unchanged for non-tileable bands.
  if (!isTileableBandNode(isl::manage_copy(Node)))
    return Node;

  const OptimizerAdditionalInfoTy *OAI =
      static_cast<const OptimizerAdditionalInfoTy *>(User);

  MatMulInfoTy MMI;
  // Try the pattern-based matrix-multiplication optimization first; fall
  // back to the standard band optimizations otherwise.
  if (PMBasedOpts && User &&
      isMatrMultPattern(isl::manage_copy(Node), OAI->D, MMI)) {
    LLVM_DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
    MatMulOpts++;
    // manage takes ownership of Node (matching __isl_take); release returns
    // ownership of the result to the caller (matching __isl_give).
    return optimizeMatMulPattern(isl::manage(Node), OAI->TTI, MMI).release();
  }

  return standardBandOpts(isl::manage(Node), User).release();
}
1653 
1654 isl::schedule
1655 ScheduleTreeOptimizer::optimizeSchedule(isl::schedule Schedule,
1656                                         const OptimizerAdditionalInfoTy *OAI) {
1657   auto Root = Schedule.get_root();
1658   Root = optimizeScheduleNode(Root, OAI);
1659   return Root.get_schedule();
1660 }
1661 
1662 isl::schedule_node ScheduleTreeOptimizer::optimizeScheduleNode(
1663     isl::schedule_node Node, const OptimizerAdditionalInfoTy *OAI) {
1664   Node = isl::manage(isl_schedule_node_map_descendant_bottom_up(
1665       Node.release(), optimizeBand,
1666       const_cast<void *>(static_cast<const void *>(OAI))));
1667   return Node;
1668 }
1669 
1670 bool ScheduleTreeOptimizer::isProfitableSchedule(Scop &S,
1671                                                  isl::schedule NewSchedule) {
1672   // To understand if the schedule has been optimized we check if the schedule
1673   // has changed at all.
1674   // TODO: We can improve this by tracking if any necessarily beneficial
1675   // transformations have been performed. This can e.g. be tiling, loop
1676   // interchange, or ...) We can track this either at the place where the
1677   // transformation has been performed or, in case of automatic ILP based
1678   // optimizations, by comparing (yet to be defined) performance metrics
1679   // before/after the scheduling optimizer
1680   // (e.g., #stride-one accesses)
1681   auto NewScheduleMap = NewSchedule.get_map();
1682   auto OldSchedule = S.getSchedule();
1683   assert(OldSchedule && "Only IslScheduleOptimizer can insert extension nodes "
1684                         "that make Scop::getSchedule() return nullptr.");
1685   bool changed = !OldSchedule.is_equal(NewScheduleMap);
1686   return changed;
1687 }
1688 
/// Legacy pass manager wrapper around the isl schedule optimizer.
class IslScheduleOptimizerWrapperPass : public ScopPass {
public:
  static char ID;

  explicit IslScheduleOptimizerWrapperPass() : ScopPass(ID) {}

  ~IslScheduleOptimizerWrapperPass() override { releaseMemory(); }

  /// Optimize the schedule of the SCoP @p S.
  bool runOnScop(Scop &S) override;

  /// Print the new schedule for the SCoP @p S.
  void printScop(raw_ostream &OS, Scop &S) const override;

  /// Register all analyses and transformation required.
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Release the internal memory.
  void releaseMemory() override {
    LastSchedule = nullptr;
    IslCtx.reset();
  }

private:
  // Shared ownership of the isl context keeps it alive while LastSchedule
  // is cached; a schedule must not outlive its context.
  std::shared_ptr<isl_ctx> IslCtx;
  // The schedule computed by the most recent runOnScop invocation; printed
  // by printScop and cleared by releaseMemory.
  isl::schedule LastSchedule;
};
1716 
1717 char IslScheduleOptimizerWrapperPass::ID = 0;
1718 
1719 /// Collect statistics for the schedule tree.
1720 ///
1721 /// @param Schedule The schedule tree to analyze. If not a schedule tree it is
1722 /// ignored.
1723 /// @param Version  The version of the schedule tree that is analyzed.
1724 ///                 0 for the original schedule tree before any transformation.
1725 ///                 1 for the schedule tree after isl's rescheduling.
1726 ///                 2 for the schedule tree after optimizations are applied
1727 ///                 (tiling, pattern matching)
static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) {
  auto Root = Schedule.get_root();
  if (!Root)
    return;

  // Visit every node top-down and bump the per-version statistics counters
  // for the node kinds of interest.
  isl_schedule_node_foreach_descendant_top_down(
      Root.get(),
      [](__isl_keep isl_schedule_node *nodeptr, void *user) -> isl_bool {
        isl::schedule_node Node = isl::manage_copy(nodeptr);
        int Version = *static_cast<int *>(user);

        switch (isl_schedule_node_get_type(Node.get())) {
        case isl_schedule_node_band: {
          // Count bands, permutable bands, band members and coincident
          // band members.
          NumBands[Version]++;
          if (isl_schedule_node_band_get_permutable(Node.get()) ==
              isl_bool_true)
            NumPermutable[Version]++;

          int CountMembers = isl_schedule_node_band_n_member(Node.get());
          NumBandMembers[Version] += CountMembers;
          for (int i = 0; i < CountMembers; i += 1) {
            if (Node.band_member_get_coincident(i))
              NumCoincident[Version]++;
          }
          break;
        }

        case isl_schedule_node_filter:
          NumFilters[Version]++;
          break;

        case isl_schedule_node_extension:
          NumExtension[Version]++;
          break;

        default:
          break;
        }

        // Continue the traversal.
        return isl_bool_true;
      },
      &Version);
}
1771 
/// Compute a new schedule for @p S using the isl scheduler and apply the
/// post-scheduling transformations.
///
/// @param S            The SCoP to optimize.
/// @param GetDeps      Callback that returns the dependence information for
///                     the requested analysis level.
/// @param TTI          Target information used by the pattern-based and
///                     standard band optimizations.
/// @param LastSchedule Receives the new schedule when the SCoP is optimized.
/// @return Always false; results are communicated through the Scop itself.
static bool runIslScheduleOptimizer(
    Scop &S,
    function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps,
    TargetTransformInfo *TTI, isl::schedule &LastSchedule) {
  // Skip SCoPs in case they're already optimised by PPCGCodeGeneration
  if (S.isToBeSkipped())
    return false;

  // Skip empty SCoPs but still allow code generation as it will delete the
  // loops present but not needed.
  if (S.getSize() == 0) {
    S.markAsOptimized();
    return false;
  }

  const Dependences &D = GetDeps(Dependences::AL_Statement);

  if (D.getSharedIslCtx() != S.getSharedIslCtx()) {
    LLVM_DEBUG(dbgs() << "DependenceInfo for another SCoP/isl_ctx\n");
    return false;
  }

  if (!D.hasValidDependences())
    return false;

  // Build input data.
  int ValidityKinds =
      Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
  int ProximityKinds;

  if (OptimizeDeps == "all")
    ProximityKinds =
        Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
  else if (OptimizeDeps == "raw")
    ProximityKinds = Dependences::TYPE_RAW;
  else {
    errs() << "Do not know how to optimize for '" << OptimizeDeps << "'"
           << " Falling back to optimizing all dependences.\n";
    ProximityKinds =
        Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
  }

  isl::union_set Domain = S.getDomains();

  if (!Domain)
    return false;

  ScopsProcessed++;
  walkScheduleTreeForStatistics(S.getScheduleTree(), 0);

  isl::union_map Validity = D.getDependences(ValidityKinds);
  isl::union_map Proximity = D.getDependences(ProximityKinds);

  // Simplify the dependences by removing the constraints introduced by the
  // domains. This can speed up the scheduling time significantly, as large
  // constant coefficients will be removed from the dependences. The
  // introduction of some additional dependences reduces the possible
  // transformations, but in most cases, such transformation do not seem to be
  // interesting anyway. In some cases this option may stop the scheduler to
  // find any schedule.
  if (SimplifyDeps == "yes") {
    Validity = Validity.gist_domain(Domain);
    Validity = Validity.gist_range(Domain);
    Proximity = Proximity.gist_domain(Domain);
    Proximity = Proximity.gist_range(Domain);
  } else if (SimplifyDeps != "no") {
    errs() << "warning: Option -polly-opt-simplify-deps should either be 'yes' "
              "or 'no'. Falling back to default: 'yes'\n";
  }

  LLVM_DEBUG(dbgs() << "\n\nCompute schedule from: ");
  LLVM_DEBUG(dbgs() << "Domain := " << Domain << ";\n");
  LLVM_DEBUG(dbgs() << "Proximity := " << Proximity << ";\n");
  LLVM_DEBUG(dbgs() << "Validity := " << Validity << ";\n");

  // Translate the string-valued command-line options into the integer
  // values expected by the isl scheduler options below.
  unsigned IslSerializeSCCs;

  if (FusionStrategy == "max") {
    IslSerializeSCCs = 0;
  } else if (FusionStrategy == "min") {
    IslSerializeSCCs = 1;
  } else {
    errs() << "warning: Unknown fusion strategy. Falling back to maximal "
              "fusion.\n";
    IslSerializeSCCs = 0;
  }

  int IslMaximizeBands;

  if (MaximizeBandDepth == "yes") {
    IslMaximizeBands = 1;
  } else if (MaximizeBandDepth == "no") {
    IslMaximizeBands = 0;
  } else {
    errs() << "warning: Option -polly-opt-maximize-bands should either be 'yes'"
              " or 'no'. Falling back to default: 'yes'\n";
    IslMaximizeBands = 1;
  }

  int IslOuterCoincidence;

  if (OuterCoincidence == "yes") {
    IslOuterCoincidence = 1;
  } else if (OuterCoincidence == "no") {
    IslOuterCoincidence = 0;
  } else {
    errs() << "warning: Option -polly-opt-outer-coincidence should either be "
              "'yes' or 'no'. Falling back to default: 'no'\n";
    IslOuterCoincidence = 0;
  }

  isl_ctx *Ctx = S.getIslCtx().get();

  // Apply the scheduler configuration to the isl context.
  isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence);
  isl_options_set_schedule_serialize_sccs(Ctx, IslSerializeSCCs);
  isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands);
  isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm);
  isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient);
  isl_options_set_tile_scale_tile_loops(Ctx, 0);

  // Continue on scheduler errors so a failed scheduling attempt can be
  // handled gracefully below; restore the previous behavior afterwards.
  auto OnErrorStatus = isl_options_get_on_error(Ctx);
  isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE);

  // Compute the new schedule from the validity (correctness), proximity
  // (locality) and coincidence (parallelism) constraints.
  auto SC = isl::schedule_constraints::on_domain(Domain);
  SC = SC.set_proximity(Proximity);
  SC = SC.set_validity(Validity);
  SC = SC.set_coincidence(Validity);
  auto Schedule = SC.compute_schedule();
  isl_options_set_on_error(Ctx, OnErrorStatus);

  walkScheduleTreeForStatistics(Schedule, 1);

  // In cases the scheduler is not able to optimize the code, we just do not
  // touch the schedule.
  if (!Schedule)
    return false;

  ScopsRescheduled++;

  LLVM_DEBUG({
    auto *P = isl_printer_to_str(Ctx);
    P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
    P = isl_printer_print_schedule(P, Schedule.get());
    auto *str = isl_printer_get_str(P);
    dbgs() << "NewScheduleTree: \n" << str << "\n";
    free(str);
    isl_printer_free(P);
  });

  // Apply the post-scheduling transformations (tiling, pattern matching).
  const OptimizerAdditionalInfoTy OAI = {TTI, const_cast<Dependences *>(&D)};
  auto NewSchedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI);
  NewSchedule = hoistExtensionNodes(NewSchedule);
  walkScheduleTreeForStatistics(NewSchedule, 2);

  if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewSchedule))
    return false;

  auto ScopStats = S.getStatistics();
  ScopsOptimized++;
  NumAffineLoopsOptimized += ScopStats.NumAffineLoops;
  NumBoxedLoopsOptimized += ScopStats.NumBoxedLoops;
  LastSchedule = NewSchedule;

  S.setScheduleTree(NewSchedule);
  S.markAsOptimized();

  if (OptimizedScops)
    errs() << S;

  return false;
}
1943 
bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) {
  // Drop any state left over from a previously optimized SCoP.
  releaseMemory();

  Function &F = S.getFunction();
  // Keep the isl context alive while LastSchedule is cached.
  IslCtx = S.getSharedIslCtx();

  // The requested analysis level is ignored; the legacy DependenceInfo pass
  // is always queried at statement level.
  auto getDependences =
      [this](Dependences::AnalysisLevel) -> const Dependences & {
    return getAnalysis<DependenceInfo>().getDependences(
        Dependences::AL_Statement);
  };
  TargetTransformInfo *TTI =
      &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  return runIslScheduleOptimizer(S, getDependences, TTI, LastSchedule);
}
1960 
1961 static void runScheduleOptimizerPrinter(raw_ostream &OS,
1962                                         isl::schedule LastSchedule) {
1963   isl_printer *p;
1964   char *ScheduleStr;
1965 
1966   OS << "Calculated schedule:\n";
1967 
1968   if (!LastSchedule) {
1969     OS << "n/a\n";
1970     return;
1971   }
1972 
1973   p = isl_printer_to_str(LastSchedule.get_ctx().get());
1974   p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK);
1975   p = isl_printer_print_schedule(p, LastSchedule.get());
1976   ScheduleStr = isl_printer_get_str(p);
1977   isl_printer_free(p);
1978 
1979   OS << ScheduleStr << "\n";
1980 
1981   free(ScheduleStr);
1982 }
1983 
void IslScheduleOptimizerWrapperPass::printScop(raw_ostream &OS, Scop &) const {
  // Print the schedule cached by the most recent runOnScop invocation; the
  // Scop argument is unused.
  runScheduleOptimizerPrinter(OS, LastSchedule);
}
1987 
void IslScheduleOptimizerWrapperPass::getAnalysisUsage(
    AnalysisUsage &AU) const {
  ScopPass::getAnalysisUsage(AU);
  // Scheduling requires the data dependences and target information.
  AU.addRequired<DependenceInfo>();
  AU.addRequired<TargetTransformInfoWrapperPass>();

  // The dependence information is declared preserved by this pass.
  AU.addPreserved<DependenceInfo>();
}
1996 
1997 } // namespace
1998 
/// Create a new instance of the legacy schedule optimizer pass.
Pass *polly::createIslScheduleOptimizerWrapperPass() {
  return new IslScheduleOptimizerWrapperPass();
}
2002 
// Register the legacy pass and its analysis dependencies with the pass
// registry.
INITIALIZE_PASS_BEGIN(IslScheduleOptimizerWrapperPass, "polly-opt-isl",
                      "Polly - Optimize schedule of SCoP", false, false);
INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass);
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass);
INITIALIZE_PASS_END(IslScheduleOptimizerWrapperPass, "polly-opt-isl",
                    "Polly - Optimize schedule of SCoP", false, false)
2010 
/// Shared implementation of the new-pass-manager run and printer passes.
///
/// @param S   The SCoP to optimize.
/// @param SAM Analysis manager used to obtain the dependence information.
/// @param SAR Standard analysis results; provides the TTI.
/// @param U   SCoP pass manager updater (currently unused).
/// @param OS  If non-null, the computed schedule is printed to this stream.
static llvm::PreservedAnalyses
runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM,
                                ScopStandardAnalysisResults &SAR, SPMUpdater &U,
                                raw_ostream *OS) {
  DependenceAnalysis::Result &Deps = SAM.getResult<DependenceAnalysis>(S, SAR);
  // The requested analysis level is ignored; statement-level dependences are
  // always returned.
  auto GetDeps = [&Deps](Dependences::AnalysisLevel) -> const Dependences & {
    return Deps.getDependences(Dependences::AL_Statement);
  };
  TargetTransformInfo *TTI = &SAR.TTI;
  isl::schedule LastSchedule;
  bool Modified = runIslScheduleOptimizer(S, GetDeps, TTI, LastSchedule);
  if (OS) {
    *OS << "Printing analysis 'Polly - Optimize schedule of SCoP' for region: '"
        << S.getName() << "' in function '" << S.getFunction().getName()
        << "':\n";
    runScheduleOptimizerPrinter(*OS, LastSchedule);
  }

  if (!Modified)
    return PreservedAnalyses::all();

  // Only the SCoP's schedule changed; module, function and loop analyses
  // are preserved.
  PreservedAnalyses PA;
  PA.preserveSet<AllAnalysesOn<Module>>();
  PA.preserveSet<AllAnalysesOn<Function>>();
  PA.preserveSet<AllAnalysesOn<Loop>>();
  return PA;
}
2038 
llvm::PreservedAnalyses
IslScheduleOptimizerPass::run(Scop &S, ScopAnalysisManager &SAM,
                              ScopStandardAnalysisResults &SAR, SPMUpdater &U) {
  // Run the optimizer without printing the resulting schedule.
  return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, nullptr);
}
2044 
llvm::PreservedAnalyses
IslScheduleOptimizerPrinterPass::run(Scop &S, ScopAnalysisManager &SAM,
                                     ScopStandardAnalysisResults &SAR,
                                     SPMUpdater &U) {
  // Run the optimizer and print the resulting schedule to OS.
  return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, &OS);
}
2051