1 //===- Schedule.cpp - Calculate an optimized schedule ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass generates an entirely new schedule tree from the data dependences
10 // and iteration domains. The new schedule tree is computed in two steps:
11 //
12 // 1) The isl scheduling optimizer is run
13 //
14 // The isl scheduling optimizer creates a new schedule tree that maximizes
15 // parallelism and tileability and minimizes data-dependence distances. The
16 // algorithm used is a modified version of the ``Pluto'' algorithm:
17 //
18 //   U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
19 //   A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
20 //   In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
21 //   Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008.
22 //
23 // 2) A set of post-scheduling transformations is applied on the schedule tree.
24 //
25 // These optimizations include:
26 //
27 //  - Tiling of the innermost tilable bands
28 //  - Prevectorization - The choice of a possible outer loop that is strip-mined
29 //                       to the innermost level to enable inner-loop
30 //                       vectorization.
31 //  - Some optimizations for spatial locality are also planned.
32 //
33 // For a detailed description of the schedule tree itself please see section 6
34 // of:
35 //
36 // Polyhedral AST generation is more than scanning polyhedra
37 // Tobias Grosser, Sven Verdoolaege, Albert Cohen
38 // ACM Transactions on Programming Languages and Systems (TOPLAS),
39 // 37(4), July 2015
40 // http://www.grosser.es/#pub-polyhedral-AST-generation
41 //
42 // This publication also contains a detailed discussion of the different options
43 // for polyhedral loop unrolling, full/partial tile separation and other uses
44 // of the schedule tree.
45 //
46 //===----------------------------------------------------------------------===//
47 
48 #include "polly/ScheduleOptimizer.h"
49 #include "polly/CodeGen/CodeGeneration.h"
50 #include "polly/DependenceInfo.h"
51 #include "polly/ManualOptimizer.h"
52 #include "polly/MatmulOptimizer.h"
53 #include "polly/Options.h"
54 #include "polly/ScheduleTreeTransform.h"
55 #include "polly/Support/ISLOStream.h"
56 #include "llvm/ADT/Sequence.h"
57 #include "llvm/ADT/Statistic.h"
58 #include "llvm/InitializePasses.h"
59 #include "llvm/Support/CommandLine.h"
60 #include "isl/options.h"
61 
62 using namespace llvm;
63 using namespace polly;
64 
65 namespace llvm {
66 class Loop;
67 class Module;
68 } // namespace llvm
69 
70 #define DEBUG_TYPE "polly-opt-isl"
71 
72 static cl::opt<std::string>
73     OptimizeDeps("polly-opt-optimize-only",
74                  cl::desc("Only a certain kind of dependences (all/raw)"),
75                  cl::Hidden, cl::init("all"), cl::ZeroOrMore,
76                  cl::cat(PollyCategory));
77 
78 static cl::opt<std::string>
79     SimplifyDeps("polly-opt-simplify-deps",
80                  cl::desc("Dependences should be simplified (yes/no)"),
81                  cl::Hidden, cl::init("yes"), cl::ZeroOrMore,
82                  cl::cat(PollyCategory));
83 
84 static cl::opt<int> MaxConstantTerm(
85     "polly-opt-max-constant-term",
86     cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden,
87     cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));
88 
89 static cl::opt<int> MaxCoefficient(
90     "polly-opt-max-coefficient",
91     cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden,
92     cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));
93 
94 static cl::opt<std::string> FusionStrategy(
95     "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"),
96     cl::Hidden, cl::init("min"), cl::ZeroOrMore, cl::cat(PollyCategory));
97 
98 static cl::opt<std::string>
99     MaximizeBandDepth("polly-opt-maximize-bands",
100                       cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
101                       cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory));
102 
103 static cl::opt<std::string> OuterCoincidence(
104     "polly-opt-outer-coincidence",
105     cl::desc("Try to construct schedules where the outer member of each band "
106              "satisfies the coincidence constraints (yes/no)"),
107     cl::Hidden, cl::init("no"), cl::ZeroOrMore, cl::cat(PollyCategory));
108 
109 static cl::opt<int> PrevectorWidth(
110     "polly-prevect-width",
111     cl::desc(
112         "The number of loop iterations to strip-mine for pre-vectorization"),
113     cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory));
114 
115 static cl::opt<bool> FirstLevelTiling("polly-tiling",
116                                       cl::desc("Enable loop tiling"),
117                                       cl::init(true), cl::ZeroOrMore,
118                                       cl::cat(PollyCategory));
119 
120 static cl::opt<int> FirstLevelDefaultTileSize(
121     "polly-default-tile-size",
122     cl::desc("The default tile size (if not enough were provided by"
123              " --polly-tile-sizes)"),
124     cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory));
125 
126 static cl::list<int>
127     FirstLevelTileSizes("polly-tile-sizes",
128                         cl::desc("A tile size for each loop dimension, filled "
129                                  "with --polly-default-tile-size"),
130                         cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
131                         cl::cat(PollyCategory));
132 
133 static cl::opt<bool>
134     SecondLevelTiling("polly-2nd-level-tiling",
135                       cl::desc("Enable a 2nd level loop of loop tiling"),
136                       cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
137 
138 static cl::opt<int> SecondLevelDefaultTileSize(
139     "polly-2nd-level-default-tile-size",
140     cl::desc("The default 2nd-level tile size (if not enough were provided by"
141              " --polly-2nd-level-tile-sizes)"),
142     cl::Hidden, cl::init(16), cl::ZeroOrMore, cl::cat(PollyCategory));
143 
144 static cl::list<int>
145     SecondLevelTileSizes("polly-2nd-level-tile-sizes",
146                          cl::desc("A tile size for each loop dimension, filled "
147                                   "with --polly-default-tile-size"),
148                          cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
149                          cl::cat(PollyCategory));
150 
151 static cl::opt<bool> RegisterTiling("polly-register-tiling",
152                                     cl::desc("Enable register tiling"),
153                                     cl::init(false), cl::ZeroOrMore,
154                                     cl::cat(PollyCategory));
155 
156 static cl::opt<int> RegisterDefaultTileSize(
157     "polly-register-tiling-default-tile-size",
158     cl::desc("The default register tile size (if not enough were provided by"
159              " --polly-register-tile-sizes)"),
160     cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory));
161 
162 static cl::list<int>
163     RegisterTileSizes("polly-register-tile-sizes",
164                       cl::desc("A tile size for each loop dimension, filled "
165                                "with --polly-register-tile-size"),
166                       cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
167                       cl::cat(PollyCategory));
168 
169 static cl::opt<bool> PragmaBasedOpts(
170     "polly-pragma-based-opts",
171     cl::desc("Apply user-directed transformation from metadata"),
172     cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory));
173 
174 static cl::opt<bool>
175     PMBasedOpts("polly-pattern-matching-based-opts",
176                 cl::desc("Perform optimizations based on pattern matching"),
177                 cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory));
178 
179 static cl::opt<bool> OptimizedScops(
180     "polly-optimized-scops",
181     cl::desc("Polly - Dump polyhedral description of Scops optimized with "
182              "the isl scheduling optimizer and the set of post-scheduling "
183              "transformations is applied on the schedule tree"),
184     cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
185 
186 STATISTIC(ScopsProcessed, "Number of scops processed");
187 STATISTIC(ScopsRescheduled, "Number of scops rescheduled");
188 STATISTIC(ScopsOptimized, "Number of scops optimized");
189 
190 STATISTIC(NumAffineLoopsOptimized, "Number of affine loops optimized");
191 STATISTIC(NumBoxedLoopsOptimized, "Number of boxed loops optimized");
192 
193 #define THREE_STATISTICS(VARNAME, DESC)                                        \
194   static Statistic VARNAME[3] = {                                              \
195       {DEBUG_TYPE, #VARNAME "0", DESC " (original)"},                          \
196       {DEBUG_TYPE, #VARNAME "1", DESC " (after scheduler)"},                   \
197       {DEBUG_TYPE, #VARNAME "2", DESC " (after optimizer)"}}
198 
199 THREE_STATISTICS(NumBands, "Number of bands");
200 THREE_STATISTICS(NumBandMembers, "Number of band members");
201 THREE_STATISTICS(NumCoincident, "Number of coincident band members");
202 THREE_STATISTICS(NumPermutable, "Number of permutable bands");
203 THREE_STATISTICS(NumFilters, "Number of filter nodes");
204 THREE_STATISTICS(NumExtension, "Number of extension nodes");
205 
206 STATISTIC(FirstLevelTileOpts, "Number of first level tiling applied");
207 STATISTIC(SecondLevelTileOpts, "Number of second level tiling applied");
208 STATISTIC(RegisterTileOpts, "Number of register tiling applied");
209 STATISTIC(PrevectOpts, "Number of strip-mining for prevectorization applied");
210 STATISTIC(MatMulOpts,
211           "Number of matrix multiplication patterns detected and optimized");
212 
213 namespace {
214 /// Additional parameters of the schedule optimizer.
215 ///
216 /// Target Transform Info and the SCoP dependencies used by the schedule
217 /// optimizer.
218 struct OptimizerAdditionalInfoTy {
219   const llvm::TargetTransformInfo *TTI;
220   const Dependences *D;
221 };
222 
223 class ScheduleTreeOptimizer {
224 public:
225   /// Apply schedule tree transformations.
226   ///
227   /// This function takes an (possibly already optimized) schedule tree and
228   /// applies a set of additional optimizations on the schedule tree. The
229   /// transformations applied include:
230   ///
231   ///   - Tiling
232   ///   - Prevectorization
233   ///
234   /// @param Schedule The schedule object the transformations will be applied
235   ///                 to.
236   /// @param OAI      Target Transform Info and the SCoP dependencies.
237   /// @returns        The transformed schedule.
238   static isl::schedule
239   optimizeSchedule(isl::schedule Schedule,
240                    const OptimizerAdditionalInfoTy *OAI = nullptr);
241 
242   /// Apply schedule tree transformations.
243   ///
244   /// This function takes a node in an (possibly already optimized) schedule
245   /// tree and applies a set of additional optimizations on this schedule tree
246   /// node and its descendants. The transformations applied include:
247   ///
248   ///   - Tiling
249   ///   - Prevectorization
250   ///
251   /// @param Node The schedule object post-transformations will be applied to.
252   /// @param OAI  Target Transform Info and the SCoP dependencies.
253   /// @returns    The transformed schedule.
254   static isl::schedule_node
255   optimizeScheduleNode(isl::schedule_node Node,
256                        const OptimizerAdditionalInfoTy *OAI = nullptr);
257 
258   /// Decide if the @p NewSchedule is profitable for @p S.
259   ///
260   /// @param S           The SCoP we optimize.
261   /// @param NewSchedule The new schedule we computed.
262   ///
263   /// @return True, if we believe @p NewSchedule is an improvement for @p S.
264   static bool isProfitableSchedule(polly::Scop &S, isl::schedule NewSchedule);
265 
266   /// Isolate a set of partial tile prefixes.
267   ///
268   /// This set should ensure that it contains only partial tile prefixes that
269   /// have exactly VectorWidth iterations.
270   ///
271   /// @param Node A schedule node band, which is a parent of a band node,
272   ///             that contains a vector loop.
273   /// @return Modified isl_schedule_node.
274   static isl::schedule_node isolateFullPartialTiles(isl::schedule_node Node,
275                                                     int VectorWidth);
276 
277 private:
278   /// Check if this node is a band node we want to tile.
279   ///
280   /// We look for innermost band nodes where individual dimensions are marked as
281   /// permutable.
282   ///
283   /// @param Node The node to check.
284   static bool isTileableBandNode(isl::schedule_node Node);
285 
286   /// Pre-vectorizes one scheduling dimension of a schedule band.
287   ///
288   /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and
289   /// sinks the resulting point loop.
290   ///
291   /// Example (DimToVectorize=0, VectorWidth=4):
292   ///
293   /// | Before transformation:
294   /// |
295   /// | A[i,j] -> [i,j]
296   /// |
297   /// | for (i = 0; i < 128; i++)
298   /// |    for (j = 0; j < 128; j++)
299   /// |      A(i,j);
300   ///
301   /// | After transformation:
302   /// |
303   /// | for (it = 0; it < 32; it+=1)
304   /// |    for (j = 0; j < 128; j++)
305   /// |      for (ip = 0; ip <= 3; ip++)
306   /// |        A(4 * it + ip,j);
307   ///
308   /// The goal of this transformation is to create a trivially vectorizable
309   /// loop.  This means a parallel loop at the innermost level that has a
310   /// constant number of iterations corresponding to the target vector width.
311   ///
312   /// This transformation creates a loop at the innermost level. The loop has
313   /// a constant number of iterations, if the number of loop iterations at
314   /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is
315   /// currently constant and not yet target specific. This function does not
316   /// reason about parallelism.
317   static isl::schedule_node prevectSchedBand(isl::schedule_node Node,
318                                              unsigned DimToVectorize,
319                                              int VectorWidth);
320 
321   /// Apply additional optimizations on the bands in the schedule tree.
322   ///
323   /// We are looking for an innermost band node and apply the following
324   /// transformations:
325   ///
326   ///  - Tile the band
327   ///      - if the band is tileable
328   ///      - if the band has more than one loop dimension
329   ///
330   ///  - Prevectorize the schedule of the band (or the point loop in case of
331   ///    tiling).
332   ///      - if vectorization is enabled
333   ///
334   /// @param Node The schedule node to (possibly) optimize.
335   /// @param User A pointer to forward some use information
336   ///        (currently unused).
337   static isl_schedule_node *optimizeBand(isl_schedule_node *Node, void *User);
338 
339   /// Apply additional optimizations on the bands in the schedule tree.
340   ///
341   /// We apply the following
342   /// transformations:
343   ///
344   ///  - Tile the band
345   ///  - Prevectorize the schedule of the band (or the point loop in case of
346   ///    tiling).
347   ///      - if vectorization is enabled
348   ///
349   /// @param Node The schedule node to (possibly) optimize.
350   /// @param User A pointer to forward some use information
351   ///        (currently unused).
352   static isl::schedule_node standardBandOpts(isl::schedule_node Node,
353                                              void *User);
354 };
355 
356 isl::schedule_node
357 ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node,
358                                                int VectorWidth) {
359   assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
360   Node = Node.child(0).child(0);
361   isl::union_map SchedRelUMap = Node.get_prefix_schedule_relation();
362   isl::union_set ScheduleRangeUSet = SchedRelUMap.range();
363   isl::set ScheduleRange{ScheduleRangeUSet};
364   isl::set IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth);
365   auto AtomicOption = getDimOptions(IsolateDomain.get_ctx(), "atomic");
366   isl::union_set IsolateOption = getIsolateOptions(IsolateDomain, 1);
367   Node = Node.parent().parent();
368   isl::union_set Options = IsolateOption.unite(AtomicOption);
369   Node = Node.band_set_ast_build_options(Options);
370   return Node;
371 }
372 
373 isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand(
374     isl::schedule_node Node, unsigned DimToVectorize, int VectorWidth) {
375   assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
376 
377   auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
378   isl_size ScheduleDimensions = Space.dim(isl::dim::set);
379   assert((isl_size)DimToVectorize < ScheduleDimensions);
380 
381   if (DimToVectorize > 0) {
382     Node = isl::manage(
383         isl_schedule_node_band_split(Node.release(), DimToVectorize));
384     Node = Node.child(0);
385   }
386   if ((isl_size)DimToVectorize < ScheduleDimensions - 1)
387     Node = isl::manage(isl_schedule_node_band_split(Node.release(), 1));
388   Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
389   auto Sizes = isl::multi_val::zero(Space);
390   Sizes = Sizes.set_val(0, isl::val(Node.get_ctx(), VectorWidth));
391   Node =
392       isl::manage(isl_schedule_node_band_tile(Node.release(), Sizes.release()));
393   Node = isolateFullPartialTiles(Node, VectorWidth);
394   Node = Node.child(0);
395   // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise,
396   // we will have troubles to match it in the backend.
397   Node = Node.band_set_ast_build_options(
398       isl::union_set(Node.get_ctx(), "{ unroll[x]: 1 = 0 }"));
399   Node = isl::manage(isl_schedule_node_band_sink(Node.release()));
400   Node = Node.child(0);
401   if (isl_schedule_node_get_type(Node.get()) == isl_schedule_node_leaf)
402     Node = Node.parent();
403   auto LoopMarker = isl::id::alloc(Node.get_ctx(), "SIMD", nullptr);
404   PrevectOpts++;
405   return Node.insert_mark(LoopMarker);
406 }
407 
408 static bool isSimpleInnermostBand(const isl::schedule_node &Node) {
409   assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
410   assert(isl_schedule_node_n_children(Node.get()) == 1);
411 
412   auto ChildType = isl_schedule_node_get_type(Node.child(0).get());
413 
414   if (ChildType == isl_schedule_node_leaf)
415     return true;
416 
417   if (ChildType != isl_schedule_node_sequence)
418     return false;
419 
420   auto Sequence = Node.child(0);
421 
422   for (int c = 0, nc = isl_schedule_node_n_children(Sequence.get()); c < nc;
423        ++c) {
424     auto Child = Sequence.child(c);
425     if (isl_schedule_node_get_type(Child.get()) != isl_schedule_node_filter)
426       return false;
427     if (isl_schedule_node_get_type(Child.child(0).get()) !=
428         isl_schedule_node_leaf)
429       return false;
430   }
431   return true;
432 }
433 
434 bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) {
435   if (isl_schedule_node_get_type(Node.get()) != isl_schedule_node_band)
436     return false;
437 
438   if (isl_schedule_node_n_children(Node.get()) != 1)
439     return false;
440 
441   if (!isl_schedule_node_band_get_permutable(Node.get()))
442     return false;
443 
444   auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
445   auto Dims = Space.dim(isl::dim::set);
446 
447   if (Dims <= 1)
448     return false;
449 
450   return isSimpleInnermostBand(Node);
451 }
452 
453 __isl_give isl::schedule_node
454 ScheduleTreeOptimizer::standardBandOpts(isl::schedule_node Node, void *User) {
455   if (FirstLevelTiling) {
456     Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes,
457                     FirstLevelDefaultTileSize);
458     FirstLevelTileOpts++;
459   }
460 
461   if (SecondLevelTiling) {
462     Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes,
463                     SecondLevelDefaultTileSize);
464     SecondLevelTileOpts++;
465   }
466 
467   if (RegisterTiling) {
468     Node =
469         applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize);
470     RegisterTileOpts++;
471   }
472 
473   if (PollyVectorizerChoice == VECTORIZER_NONE)
474     return Node;
475 
476   auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
477   auto Dims = Space.dim(isl::dim::set);
478 
479   for (int i = Dims - 1; i >= 0; i--)
480     if (Node.band_member_get_coincident(i)) {
481       Node = prevectSchedBand(Node, i, PrevectorWidth);
482       break;
483     }
484 
485   return Node;
486 }
487 
488 __isl_give isl_schedule_node *
489 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node,
490                                     void *User) {
491   if (!isTileableBandNode(isl::manage_copy(Node)))
492     return Node;
493 
494   const OptimizerAdditionalInfoTy *OAI =
495       static_cast<const OptimizerAdditionalInfoTy *>(User);
496 
497   if (PMBasedOpts && User) {
498     isl::schedule_node PatternOptimizedSchedule =
499         tryOptimizeMatMulPattern(isl::manage_copy(Node), OAI->TTI, OAI->D);
500     if (!PatternOptimizedSchedule.is_null()) {
501       MatMulOpts++;
502       isl_schedule_node_free(Node);
503       return PatternOptimizedSchedule.release();
504     }
505   }
506 
507   return standardBandOpts(isl::manage(Node), User).release();
508 }
509 
510 isl::schedule
511 ScheduleTreeOptimizer::optimizeSchedule(isl::schedule Schedule,
512                                         const OptimizerAdditionalInfoTy *OAI) {
513   auto Root = Schedule.get_root();
514   Root = optimizeScheduleNode(Root, OAI);
515   return Root.get_schedule();
516 }
517 
518 isl::schedule_node ScheduleTreeOptimizer::optimizeScheduleNode(
519     isl::schedule_node Node, const OptimizerAdditionalInfoTy *OAI) {
520   Node = isl::manage(isl_schedule_node_map_descendant_bottom_up(
521       Node.release(), optimizeBand,
522       const_cast<void *>(static_cast<const void *>(OAI))));
523   return Node;
524 }
525 
526 bool ScheduleTreeOptimizer::isProfitableSchedule(Scop &S,
527                                                  isl::schedule NewSchedule) {
528   // To understand if the schedule has been optimized we check if the schedule
529   // has changed at all.
530   // TODO: We can improve this by tracking if any necessarily beneficial
531   // transformations have been performed. This can e.g. be tiling, loop
532   // interchange, or ...) We can track this either at the place where the
533   // transformation has been performed or, in case of automatic ILP based
534   // optimizations, by comparing (yet to be defined) performance metrics
535   // before/after the scheduling optimizer
536   // (e.g., #stride-one accesses)
537   auto NewScheduleMap = NewSchedule.get_map();
538   auto OldSchedule = S.getSchedule();
539   assert(!OldSchedule.is_null() &&
540          "Only IslScheduleOptimizer can insert extension nodes "
541          "that make Scop::getSchedule() return nullptr.");
542   bool changed = !OldSchedule.is_equal(NewScheduleMap);
543   return changed;
544 }
545 
546 class IslScheduleOptimizerWrapperPass : public ScopPass {
547 public:
548   static char ID;
549 
550   explicit IslScheduleOptimizerWrapperPass() : ScopPass(ID) {}
551 
552   /// Optimize the schedule of the SCoP @p S.
553   bool runOnScop(Scop &S) override;
554 
555   /// Print the new schedule for the SCoP @p S.
556   void printScop(raw_ostream &OS, Scop &S) const override;
557 
558   /// Register all analyses and transformation required.
559   void getAnalysisUsage(AnalysisUsage &AU) const override;
560 
561   /// Release the internal memory.
562   void releaseMemory() override {
563     LastSchedule = {};
564     IslCtx.reset();
565   }
566 
567 private:
568   std::shared_ptr<isl_ctx> IslCtx;
569   isl::schedule LastSchedule;
570 };
571 
572 char IslScheduleOptimizerWrapperPass::ID = 0;
573 
574 #ifndef NDEBUG
575 static void printSchedule(llvm::raw_ostream &OS, const isl::schedule &Schedule,
576                           StringRef Desc) {
577   isl::ctx Ctx = Schedule.get_ctx();
578   isl_printer *P = isl_printer_to_str(Ctx.get());
579   P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
580   P = isl_printer_print_schedule(P, Schedule.get());
581   char *Str = isl_printer_get_str(P);
582   OS << Desc << ": \n" << Str << "\n";
583   free(Str);
584   isl_printer_free(P);
585 }
586 #endif
587 
588 /// Collect statistics for the schedule tree.
589 ///
590 /// @param Schedule The schedule tree to analyze. If not a schedule tree it is
591 /// ignored.
592 /// @param Version  The version of the schedule tree that is analyzed.
593 ///                 0 for the original schedule tree before any transformation.
594 ///                 1 for the schedule tree after isl's rescheduling.
595 ///                 2 for the schedule tree after optimizations are applied
596 ///                 (tiling, pattern matching)
597 static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) {
598   auto Root = Schedule.get_root();
599   if (Root.is_null())
600     return;
601 
602   isl_schedule_node_foreach_descendant_top_down(
603       Root.get(),
604       [](__isl_keep isl_schedule_node *nodeptr, void *user) -> isl_bool {
605         isl::schedule_node Node = isl::manage_copy(nodeptr);
606         int Version = *static_cast<int *>(user);
607 
608         switch (isl_schedule_node_get_type(Node.get())) {
609         case isl_schedule_node_band: {
610           NumBands[Version]++;
611           if (isl_schedule_node_band_get_permutable(Node.get()) ==
612               isl_bool_true)
613             NumPermutable[Version]++;
614 
615           int CountMembers = isl_schedule_node_band_n_member(Node.get());
616           NumBandMembers[Version] += CountMembers;
617           for (int i = 0; i < CountMembers; i += 1) {
618             if (Node.band_member_get_coincident(i))
619               NumCoincident[Version]++;
620           }
621           break;
622         }
623 
624         case isl_schedule_node_filter:
625           NumFilters[Version]++;
626           break;
627 
628         case isl_schedule_node_extension:
629           NumExtension[Version]++;
630           break;
631 
632         default:
633           break;
634         }
635 
636         return isl_bool_true;
637       },
638       &Version);
639 }
640 
641 static bool runIslScheduleOptimizer(
642     Scop &S,
643     function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps,
644     TargetTransformInfo *TTI, isl::schedule &LastSchedule) {
645   // Skip SCoPs in case they're already optimised by PPCGCodeGeneration
646   if (S.isToBeSkipped())
647     return false;
648 
649   // Skip empty SCoPs but still allow code generation as it will delete the
650   // loops present but not needed.
651   if (S.getSize() == 0) {
652     S.markAsOptimized();
653     return false;
654   }
655 
656   ScopsProcessed++;
657 
658   // Schedule without optimizations.
659   isl::schedule Schedule = S.getScheduleTree();
660   walkScheduleTreeForStatistics(S.getScheduleTree(), 0);
661   LLVM_DEBUG(printSchedule(dbgs(), Schedule, "Original schedule tree"));
662 
663   bool HasUserTransformation = false;
664   if (PragmaBasedOpts) {
665     isl::schedule ManuallyTransformed =
666         applyManualTransformations(&S, Schedule);
667     if (ManuallyTransformed.is_null()) {
668       LLVM_DEBUG(dbgs() << "Error during manual optimization\n");
669       return false;
670     }
671 
672     if (ManuallyTransformed.get() != Schedule.get()) {
673       // User transformations have precedence over other transformations.
674       HasUserTransformation = true;
675       Schedule = std::move(ManuallyTransformed);
676       LLVM_DEBUG(
677           printSchedule(dbgs(), Schedule, "After manual transformations"));
678     }
679   }
680 
681   // Only continue if either manual transformations have been applied or we are
682   // allowed to apply heuristics.
683   // TODO: Detect disabled heuristics and no user-directed transformation
684   // metadata earlier in ScopDetection.
685   if (!HasUserTransformation && S.hasDisableHeuristicsHint()) {
686     LLVM_DEBUG(dbgs() << "Heuristic optimizations disabled by metadata\n");
687     return false;
688   }
689 
690   // Get dependency analysis.
691   const Dependences &D = GetDeps(Dependences::AL_Statement);
692   if (D.getSharedIslCtx() != S.getSharedIslCtx()) {
693     LLVM_DEBUG(dbgs() << "DependenceInfo for another SCoP/isl_ctx\n");
694     return false;
695   }
696   if (!D.hasValidDependences()) {
697     LLVM_DEBUG(dbgs() << "Dependency information not available\n");
698     return false;
699   }
700 
701   // Apply ISL's algorithm only if not overriden by the user. Note that
702   // post-rescheduling optimizations (tiling, pattern-based, prevectorization)
703   // rely on the coincidence/permutable annotations on schedule tree bands that
704   // are added by the rescheduling analyzer. Therefore, disabling the
705   // rescheduler implicitly also disables these optimizations.
706   if (HasUserTransformation) {
707     LLVM_DEBUG(
708         dbgs() << "Skipping rescheduling due to manual transformation\n");
709   } else {
710     // Build input data.
711     int ValidityKinds =
712         Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
713     int ProximityKinds;
714 
715     if (OptimizeDeps == "all")
716       ProximityKinds =
717           Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
718     else if (OptimizeDeps == "raw")
719       ProximityKinds = Dependences::TYPE_RAW;
720     else {
721       errs() << "Do not know how to optimize for '" << OptimizeDeps << "'"
722              << " Falling back to optimizing all dependences.\n";
723       ProximityKinds =
724           Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
725     }
726 
727     isl::union_set Domain = S.getDomains();
728 
729     if (Domain.is_null())
730       return false;
731 
732     isl::union_map Validity = D.getDependences(ValidityKinds);
733     isl::union_map Proximity = D.getDependences(ProximityKinds);
734 
735     // Simplify the dependences by removing the constraints introduced by the
736     // domains. This can speed up the scheduling time significantly, as large
737     // constant coefficients will be removed from the dependences. The
738     // introduction of some additional dependences reduces the possible
739     // transformations, but in most cases, such transformation do not seem to be
740     // interesting anyway. In some cases this option may stop the scheduler to
741     // find any schedule.
742     if (SimplifyDeps == "yes") {
743       Validity = Validity.gist_domain(Domain);
744       Validity = Validity.gist_range(Domain);
745       Proximity = Proximity.gist_domain(Domain);
746       Proximity = Proximity.gist_range(Domain);
747     } else if (SimplifyDeps != "no") {
748       errs()
749           << "warning: Option -polly-opt-simplify-deps should either be 'yes' "
750              "or 'no'. Falling back to default: 'yes'\n";
751     }
752 
753     LLVM_DEBUG(dbgs() << "\n\nCompute schedule from: ");
754     LLVM_DEBUG(dbgs() << "Domain := " << Domain << ";\n");
755     LLVM_DEBUG(dbgs() << "Proximity := " << Proximity << ";\n");
756     LLVM_DEBUG(dbgs() << "Validity := " << Validity << ";\n");
757 
758     unsigned IslSerializeSCCs;
759 
760     if (FusionStrategy == "max") {
761       IslSerializeSCCs = 0;
762     } else if (FusionStrategy == "min") {
763       IslSerializeSCCs = 1;
764     } else {
765       errs() << "warning: Unknown fusion strategy. Falling back to maximal "
766                 "fusion.\n";
767       IslSerializeSCCs = 0;
768     }
769 
770     int IslMaximizeBands;
771 
772     if (MaximizeBandDepth == "yes") {
773       IslMaximizeBands = 1;
774     } else if (MaximizeBandDepth == "no") {
775       IslMaximizeBands = 0;
776     } else {
777       errs()
778           << "warning: Option -polly-opt-maximize-bands should either be 'yes'"
779              " or 'no'. Falling back to default: 'yes'\n";
780       IslMaximizeBands = 1;
781     }
782 
783     int IslOuterCoincidence;
784 
785     if (OuterCoincidence == "yes") {
786       IslOuterCoincidence = 1;
787     } else if (OuterCoincidence == "no") {
788       IslOuterCoincidence = 0;
789     } else {
790       errs() << "warning: Option -polly-opt-outer-coincidence should either be "
791                 "'yes' or 'no'. Falling back to default: 'no'\n";
792       IslOuterCoincidence = 0;
793     }
794 
795     isl_ctx *Ctx = S.getIslCtx().get();
796 
797     isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence);
798     isl_options_set_schedule_serialize_sccs(Ctx, IslSerializeSCCs);
799     isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands);
800     isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm);
801     isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient);
802     isl_options_set_tile_scale_tile_loops(Ctx, 0);
803 
804     auto OnErrorStatus = isl_options_get_on_error(Ctx);
805     isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE);
806 
807     auto SC = isl::schedule_constraints::on_domain(Domain);
808     SC = SC.set_proximity(Proximity);
809     SC = SC.set_validity(Validity);
810     SC = SC.set_coincidence(Validity);
811     Schedule = SC.compute_schedule();
812     isl_options_set_on_error(Ctx, OnErrorStatus);
813 
814     ScopsRescheduled++;
815     LLVM_DEBUG(printSchedule(dbgs(), Schedule, "After rescheduling"));
816   }
817 
818   walkScheduleTreeForStatistics(Schedule, 1);
819 
820   // In cases the scheduler is not able to optimize the code, we just do not
821   // touch the schedule.
822   if (Schedule.is_null())
823     return false;
824 
825   // Apply post-rescheduling optimizations.
826   const OptimizerAdditionalInfoTy OAI = {TTI, const_cast<Dependences *>(&D)};
827   Schedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI);
828   Schedule = hoistExtensionNodes(Schedule);
829   LLVM_DEBUG(printSchedule(dbgs(), Schedule, "After post-optimizations"));
830   walkScheduleTreeForStatistics(Schedule, 2);
831 
832   if (!ScheduleTreeOptimizer::isProfitableSchedule(S, Schedule))
833     return false;
834 
835   auto ScopStats = S.getStatistics();
836   ScopsOptimized++;
837   NumAffineLoopsOptimized += ScopStats.NumAffineLoops;
838   NumBoxedLoopsOptimized += ScopStats.NumBoxedLoops;
839   LastSchedule = Schedule;
840 
841   S.setScheduleTree(Schedule);
842   S.markAsOptimized();
843 
844   if (OptimizedScops)
845     errs() << S;
846 
847   return false;
848 }
849 
850 bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) {
851   releaseMemory();
852 
853   Function &F = S.getFunction();
854   IslCtx = S.getSharedIslCtx();
855 
856   auto getDependences =
857       [this](Dependences::AnalysisLevel) -> const Dependences & {
858     return getAnalysis<DependenceInfo>().getDependences(
859         Dependences::AL_Statement);
860   };
861   // auto &Deps  = getAnalysis<DependenceInfo>();
862   TargetTransformInfo *TTI =
863       &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
864   return runIslScheduleOptimizer(S, getDependences, TTI, LastSchedule);
865 }
866 
867 static void runScheduleOptimizerPrinter(raw_ostream &OS,
868                                         isl::schedule LastSchedule) {
869   isl_printer *p;
870   char *ScheduleStr;
871 
872   OS << "Calculated schedule:\n";
873 
874   if (LastSchedule.is_null()) {
875     OS << "n/a\n";
876     return;
877   }
878 
879   p = isl_printer_to_str(LastSchedule.get_ctx().get());
880   p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK);
881   p = isl_printer_print_schedule(p, LastSchedule.get());
882   ScheduleStr = isl_printer_get_str(p);
883   isl_printer_free(p);
884 
885   OS << ScheduleStr << "\n";
886 
887   free(ScheduleStr);
888 }
889 
890 void IslScheduleOptimizerWrapperPass::printScop(raw_ostream &OS, Scop &) const {
891   runScheduleOptimizerPrinter(OS, LastSchedule);
892 }
893 
894 void IslScheduleOptimizerWrapperPass::getAnalysisUsage(
895     AnalysisUsage &AU) const {
896   ScopPass::getAnalysisUsage(AU);
897   AU.addRequired<DependenceInfo>();
898   AU.addRequired<TargetTransformInfoWrapperPass>();
899 
900   AU.addPreserved<DependenceInfo>();
901 }
902 
903 } // namespace
904 
905 Pass *polly::createIslScheduleOptimizerWrapperPass() {
906   return new IslScheduleOptimizerWrapperPass();
907 }
908 
909 INITIALIZE_PASS_BEGIN(IslScheduleOptimizerWrapperPass, "polly-opt-isl",
910                       "Polly - Optimize schedule of SCoP", false, false);
911 INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
912 INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass);
913 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass);
914 INITIALIZE_PASS_END(IslScheduleOptimizerWrapperPass, "polly-opt-isl",
915                     "Polly - Optimize schedule of SCoP", false, false)
916 
917 static llvm::PreservedAnalyses
918 runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM,
919                                 ScopStandardAnalysisResults &SAR, SPMUpdater &U,
920                                 raw_ostream *OS) {
921   DependenceAnalysis::Result &Deps = SAM.getResult<DependenceAnalysis>(S, SAR);
922   auto GetDeps = [&Deps](Dependences::AnalysisLevel) -> const Dependences & {
923     return Deps.getDependences(Dependences::AL_Statement);
924   };
925   TargetTransformInfo *TTI = &SAR.TTI;
926   isl::schedule LastSchedule;
927   bool Modified = runIslScheduleOptimizer(S, GetDeps, TTI, LastSchedule);
928   if (OS) {
929     *OS << "Printing analysis 'Polly - Optimize schedule of SCoP' for region: '"
930         << S.getName() << "' in function '" << S.getFunction().getName()
931         << "':\n";
932     runScheduleOptimizerPrinter(*OS, LastSchedule);
933   }
934 
935   if (!Modified)
936     return PreservedAnalyses::all();
937 
938   PreservedAnalyses PA;
939   PA.preserveSet<AllAnalysesOn<Module>>();
940   PA.preserveSet<AllAnalysesOn<Function>>();
941   PA.preserveSet<AllAnalysesOn<Loop>>();
942   return PA;
943 }
944 
945 llvm::PreservedAnalyses
946 IslScheduleOptimizerPass::run(Scop &S, ScopAnalysisManager &SAM,
947                               ScopStandardAnalysisResults &SAR, SPMUpdater &U) {
948   return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, nullptr);
949 }
950 
951 llvm::PreservedAnalyses
952 IslScheduleOptimizerPrinterPass::run(Scop &S, ScopAnalysisManager &SAM,
953                                      ScopStandardAnalysisResults &SAR,
954                                      SPMUpdater &U) {
955   return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, &OS);
956 }
957