1 //===- ScheduleOptimizer.cpp - Calculate an optimized schedule ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass generates an entirely new schedule tree from the data dependences
10 // and iteration domains. The new schedule tree is computed in two steps:
11 //
12 // 1) The isl scheduling optimizer is run
13 //
14 // The isl scheduling optimizer creates a new schedule tree that maximizes
15 // parallelism and tileability and minimizes data-dependence distances. The
16 // algorithm used is a modified version of the ``Pluto'' algorithm:
17 //
18 //   U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
19 //   A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
20 //   In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
21 //   Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008.
22 //
23 // 2) A set of post-scheduling transformations is applied on the schedule tree.
24 //
25 // These optimizations include:
26 //
27 //  - Tiling of the innermost tilable bands
28 //  - Prevectorization - The choice of a possible outer loop that is strip-mined
29 //                       to the innermost level to enable inner-loop
30 //                       vectorization.
31 //  - Some optimizations for spatial locality are also planned.
32 //
33 // For a detailed description of the schedule tree itself please see section 6
34 // of:
35 //
36 // Polyhedral AST generation is more than scanning polyhedra
37 // Tobias Grosser, Sven Verdoolaege, Albert Cohen
38 // ACM Transactions on Programming Languages and Systems (TOPLAS),
39 // 37(4), July 2015
40 // http://www.grosser.es/#pub-polyhedral-AST-generation
41 //
42 // This publication also contains a detailed discussion of the different options
43 // for polyhedral loop unrolling, full/partial tile separation and other uses
44 // of the schedule tree.
45 //
46 //===----------------------------------------------------------------------===//
47 
48 #include "polly/ScheduleOptimizer.h"
49 #include "polly/CodeGen/CodeGeneration.h"
50 #include "polly/DependenceInfo.h"
51 #include "polly/ManualOptimizer.h"
52 #include "polly/MatmulOptimizer.h"
53 #include "polly/Options.h"
54 #include "polly/ScheduleTreeTransform.h"
55 #include "polly/Support/ISLOStream.h"
56 #include "polly/Support/ISLTools.h"
57 #include "llvm/ADT/Sequence.h"
58 #include "llvm/ADT/Statistic.h"
59 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
60 #include "llvm/InitializePasses.h"
61 #include "llvm/Support/CommandLine.h"
62 #include "isl/options.h"
63 
64 using namespace llvm;
65 using namespace polly;
66 
67 namespace llvm {
68 class Loop;
69 class Module;
70 } // namespace llvm
71 
72 #define DEBUG_TYPE "polly-opt-isl"
73 
// Selects which dependence kinds are used as proximity constraints when
// rescheduling: "all" (RAW, WAR and WAW) or "raw" (RAW only). Any other value
// prints a diagnostic and is treated like "all".
static cl::opt<std::string>
    OptimizeDeps("polly-opt-optimize-only",
                 cl::desc("Only a certain kind of dependences (all/raw)"),
                 cl::Hidden, cl::init("all"), cl::ZeroOrMore,
                 cl::cat(PollyCategory));

// With "yes", the validity and proximity dependences are simplified (gist)
// with respect to the iteration domains before scheduling.
// NOTE(review): for values other than "yes"/"no" the visible code only prints
// a warning that announces a fallback to 'yes' but does not simplify the
// dependences - confirm which behavior is intended.
static cl::opt<std::string>
    SimplifyDeps("polly-opt-simplify-deps",
                 cl::desc("Dependences should be simplified (yes/no)"),
                 cl::Hidden, cl::init("yes"), cl::ZeroOrMore,
                 cl::cat(PollyCategory));

// MaxConstantTerm, MaxCoefficient, MaximizeBandDepth, GreedyFusion and
// OuterCoincidence are not read in the part of the file reviewed here;
// presumably they are forwarded to the isl scheduler - TODO confirm.
static cl::opt<int> MaxConstantTerm(
    "polly-opt-max-constant-term",
    cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden,
    cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<int> MaxCoefficient(
    "polly-opt-max-coefficient",
    cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden,
    cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<std::string>
    MaximizeBandDepth("polly-opt-maximize-bands",
                      cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
                      cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<bool>
    GreedyFusion("polly-loopfusion-greedy",
                 cl::desc("Aggressively try to fuse everything"), cl::Hidden,
                 cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<std::string> OuterCoincidence(
    "polly-opt-outer-coincidence",
    cl::desc("Try to construct schedules where the outer member of each band "
             "satisfies the coincidence constraints (yes/no)"),
    cl::Hidden, cl::init("no"), cl::ZeroOrMore, cl::cat(PollyCategory));

// Vector width handed to prevectSchedBand by applyPrevectBandOpt.
static cl::opt<int> PrevectorWidth(
    "polly-prevect-width",
    cl::desc(
        "The number of loop iterations to strip-mine for pre-vectorization"),
    cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory));

// Enables the "1st level tiling" step of applyTileBandOpt.
static cl::opt<bool> FirstLevelTiling("polly-tiling",
                                      cl::desc("Enable loop tiling"),
                                      cl::init(true), cl::ZeroOrMore,
                                      cl::cat(PollyCategory));

// Default size passed to tileNode for the first tiling level, next to the
// explicit sizes in FirstLevelTileSizes.
static cl::opt<int> FirstLevelDefaultTileSize(
    "polly-default-tile-size",
    cl::desc("The default tile size (if not enough were provided by"
             " --polly-tile-sizes)"),
    cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory));

// Explicit, comma-separated tile sizes for the first tiling level.
static cl::list<int>
    FirstLevelTileSizes("polly-tile-sizes",
                        cl::desc("A tile size for each loop dimension, filled "
                                 "with --polly-default-tile-size"),
                        cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
                        cl::cat(PollyCategory));

// Enables the "2nd level tiling" step of applyTileBandOpt.
static cl::opt<bool>
    SecondLevelTiling("polly-2nd-level-tiling",
                      cl::desc("Enable a 2nd level loop of loop tiling"),
                      cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

// Default size passed to tileNode for the second tiling level, next to the
// explicit sizes in SecondLevelTileSizes.
static cl::opt<int> SecondLevelDefaultTileSize(
    "polly-2nd-level-default-tile-size",
    cl::desc("The default 2nd-level tile size (if not enough were provided by"
             " --polly-2nd-level-tile-sizes)"),
    cl::Hidden, cl::init(16), cl::ZeroOrMore, cl::cat(PollyCategory));
146 
147 static cl::list<int>
148     SecondLevelTileSizes("polly-2nd-level-tile-sizes",
149                          cl::desc("A tile size for each loop dimension, filled "
150                                   "with --polly-default-tile-size"),
151                          cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
152                          cl::cat(PollyCategory));
153 
// Enables the applyRegisterTiling step of applyTileBandOpt.
static cl::opt<bool> RegisterTiling("polly-register-tiling",
                                    cl::desc("Enable register tiling"),
                                    cl::init(false), cl::ZeroOrMore,
                                    cl::cat(PollyCategory));

// Default size passed to applyRegisterTiling, next to the explicit sizes in
// RegisterTileSizes.
static cl::opt<int> RegisterDefaultTileSize(
    "polly-register-tiling-default-tile-size",
    cl::desc("The default register tile size (if not enough were provided by"
             " --polly-register-tile-sizes)"),
    cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory));
164 
165 static cl::list<int>
166     RegisterTileSizes("polly-register-tile-sizes",
167                       cl::desc("A tile size for each loop dimension, filled "
168                                "with --polly-register-tile-size"),
169                       cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
170                       cl::cat(PollyCategory));
171 
// When set, runIslScheduleOptimizer first calls applyManualTransformations on
// the schedule tree before any heuristic optimization.
static cl::opt<bool> PragmaBasedOpts(
    "polly-pragma-based-opts",
    cl::desc("Apply user-directed transformation from metadata"),
    cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory));

// When cleared, runIslScheduleOptimizer skips the rescheduling step.
static cl::opt<bool> EnableReschedule("polly-reschedule",
                                      cl::desc("Optimize SCoPs using ISL"),
                                      cl::init(true), cl::ZeroOrMore,
                                      cl::cat(PollyCategory));

// PMBasedOpts, EnablePostopts and OptimizedScops are not read in the part of
// the file reviewed here; presumably the first two feed the PatternOpts and
// Postopts fields of OptimizerAdditionalInfoTy - TODO confirm.
static cl::opt<bool>
    PMBasedOpts("polly-pattern-matching-based-opts",
                cl::desc("Perform optimizations based on pattern matching"),
                cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<bool>
    EnablePostopts("polly-postopts",
                   cl::desc("Apply post-rescheduling optimizations such as "
                            "tiling (requires -polly-reschedule)"),
                   cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<bool> OptimizedScops(
    "polly-optimized-scops",
    cl::desc("Polly - Dump polyhedral description of Scops optimized with "
             "the isl scheduling optimizer and the set of post-scheduling "
             "transformations is applied on the schedule tree"),
    cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
199 
// Per-SCoP counters. ScopsProcessed is incremented by runIslScheduleOptimizer
// for every SCoP that is neither skipped nor empty.
STATISTIC(ScopsProcessed, "Number of scops processed");
STATISTIC(ScopsRescheduled, "Number of scops rescheduled");
STATISTIC(ScopsOptimized, "Number of scops optimized");

STATISTIC(NumAffineLoopsOptimized, "Number of affine loops optimized");
STATISTIC(NumBoxedLoopsOptimized, "Number of boxed loops optimized");

// Defines an array of three statistics named VARNAME0..VARNAME2. The index is
// the schedule tree version used by walkScheduleTreeForStatistics:
// 0 = original, 1 = after the scheduler, 2 = after the optimizer.
#define THREE_STATISTICS(VARNAME, DESC)                                        \
  static Statistic VARNAME[3] = {                                              \
      {DEBUG_TYPE, #VARNAME "0", DESC " (original)"},                          \
      {DEBUG_TYPE, #VARNAME "1", DESC " (after scheduler)"},                   \
      {DEBUG_TYPE, #VARNAME "2", DESC " (after optimizer)"}}

// Schedule tree shape counters, updated by walkScheduleTreeForStatistics.
THREE_STATISTICS(NumBands, "Number of bands");
THREE_STATISTICS(NumBandMembers, "Number of band members");
THREE_STATISTICS(NumCoincident, "Number of coincident band members");
THREE_STATISTICS(NumPermutable, "Number of permutable bands");
THREE_STATISTICS(NumFilters, "Number of filter nodes");
THREE_STATISTICS(NumExtension, "Number of extension nodes");

// Counters for the individual band transformations. The tiling counters are
// incremented in applyTileBandOpt, PrevectOpts in prevectSchedBand and
// MatMulOpts in optimizeBand.
STATISTIC(FirstLevelTileOpts, "Number of first level tiling applied");
STATISTIC(SecondLevelTileOpts, "Number of second level tiling applied");
STATISTIC(RegisterTileOpts, "Number of register tiling applied");
STATISTIC(PrevectOpts, "Number of strip-mining for prevectorization applied");
STATISTIC(MatMulOpts,
          "Number of matrix multiplication patterns detected and optimized");
226 
227 namespace {
/// Additional parameters of the schedule optimizer.
///
/// Bundles the Target Transform Info, the SCoP dependencies and the switches
/// that select which transformations ScheduleTreeOptimizer::optimizeBand
/// applies to a tileable band.
struct OptimizerAdditionalInfoTy {
  /// Target information; forwarded to tryOptimizeMatMulPattern.
  const llvm::TargetTransformInfo *TTI;
  /// Dependences of the SCoP; forwarded to tryOptimizeMatMulPattern.
  const Dependences *D;
  /// Try the matrix-multiplication pattern optimization first.
  bool PatternOpts;
  /// Apply the tiling optimizations (applyTileBandOpt).
  bool Postopts;
  /// Apply prevectorization (applyPrevectBandOpt).
  bool Prevect;
};
239 
/// Collection of static helpers that apply post-scheduling transformations
/// (pattern-based optimization, tiling, prevectorization) to a schedule tree.
class ScheduleTreeOptimizer {
public:
  /// Apply schedule tree transformations.
  ///
  /// This function takes an (possibly already optimized) schedule tree and
  /// applies a set of additional optimizations on the schedule tree. The
  /// transformations applied include:
  ///
  ///   - Pattern-based optimizations
  ///   - Tiling
  ///   - Prevectorization
  ///
  /// @param Schedule The schedule object the transformations will be applied
  ///                 to.
  /// @param OAI      Target Transform Info and the SCoP dependencies. Although
  ///                 the parameter defaults to nullptr, optimizeBand asserts
  ///                 that it is non-null.
  /// @returns        The transformed schedule.
  static isl::schedule
  optimizeSchedule(isl::schedule Schedule,
                   const OptimizerAdditionalInfoTy *OAI = nullptr);

  /// Apply schedule tree transformations.
  ///
  /// This function takes a node in an (possibly already optimized) schedule
  /// tree and applies a set of additional optimizations on this schedule tree
  /// node and its descendants. The transformations applied include:
  ///
  ///   - Pattern-based optimizations
  ///   - Tiling
  ///   - Prevectorization
  ///
  /// @param Node The schedule object post-transformations will be applied to.
  /// @param OAI  Target Transform Info and the SCoP dependencies. Although the
  ///             parameter defaults to nullptr, optimizeBand asserts that it
  ///             is non-null.
  /// @returns    The transformed schedule.
  static isl::schedule_node
  optimizeScheduleNode(isl::schedule_node Node,
                       const OptimizerAdditionalInfoTy *OAI = nullptr);

  /// Decide if the @p NewSchedule is profitable for @p S.
  ///
  /// The current implementation considers a schedule profitable as soon as
  /// its schedule map differs from the one of @p S.
  ///
  /// @param S           The SCoP we optimize.
  /// @param NewSchedule The new schedule we computed.
  ///
  /// @return True, if we believe @p NewSchedule is an improvement for @p S.
  static bool isProfitableSchedule(polly::Scop &S, isl::schedule NewSchedule);

  /// Isolate a set of partial tile prefixes.
  ///
  /// This set should ensure that it contains only partial tile prefixes that
  /// have exactly VectorWidth iterations.
  ///
  /// @param Node        A schedule node band, which is a parent of a band node,
  ///                    that contains a vector loop.
  /// @param VectorWidth The number of iterations of the vector loop; forwarded
  ///                    to getPartialTilePrefixes.
  /// @return Modified isl_schedule_node.
  static isl::schedule_node isolateFullPartialTiles(isl::schedule_node Node,
                                                    int VectorWidth);

private:
  /// Check if this node is a band node we want to tile.
  ///
  /// We look for innermost band nodes where individual dimensions are marked as
  /// permutable.
  ///
  /// @param Node The node to check.
  /// @return True if @p Node is a permutable band with a single child and more
  ///         than one member that is also a simple innermost band.
  static bool isTileableBandNode(isl::schedule_node Node);

  /// Pre-vectorizes one scheduling dimension of a schedule band.
  ///
  /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and
  /// sinks the resulting point loop.
  ///
  /// Example (DimToVectorize=0, VectorWidth=4):
  ///
  /// | Before transformation:
  /// |
  /// | A[i,j] -> [i,j]
  /// |
  /// | for (i = 0; i < 128; i++)
  /// |    for (j = 0; j < 128; j++)
  /// |      A(i,j);
  ///
  /// | After transformation:
  /// |
  /// | for (it = 0; it < 32; it+=1)
  /// |    for (j = 0; j < 128; j++)
  /// |      for (ip = 0; ip <= 3; ip++)
  /// |        A(4 * it + ip,j);
  ///
  /// The goal of this transformation is to create a trivially vectorizable
  /// loop.  This means a parallel loop at the innermost level that has a
  /// constant number of iterations corresponding to the target vector width.
  ///
  /// This transformation creates a loop at the innermost level. The loop has
  /// a constant number of iterations, if the number of loop iterations at
  /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is
  /// currently constant and not yet target specific. This function does not
  /// reason about parallelism.
  ///
  /// @param Node           The band node to transform.
  /// @param DimToVectorize The band member to strip-mine; must be smaller than
  ///                       the number of band members.
  /// @param VectorWidth    The tile size used for the strip-mined dimension.
  static isl::schedule_node prevectSchedBand(isl::schedule_node Node,
                                             unsigned DimToVectorize,
                                             int VectorWidth);

  /// Apply additional optimizations on the bands in the schedule tree.
  ///
  /// We are looking for an innermost band node and apply the following
  /// transformations:
  ///
  ///  - Tile the band
  ///      - if the band is tileable
  ///      - if the band has more than one loop dimension
  ///
  ///  - Prevectorize the schedule of the band (or the point loop in case of
  ///    tiling).
  ///      - if vectorization is enabled
  ///
  /// @param Node The schedule node to (possibly) optimize.
  /// @param User Pointer to the const OptimizerAdditionalInfoTy that selects
  ///        the transformations to apply; must not be null.
  /// @returns The (possibly transformed) schedule node.
  static isl_schedule_node *optimizeBand(isl_schedule_node *Node, void *User);

  /// Apply tiling optimizations on the bands in the schedule tree.
  ///
  /// @param Node The schedule node to (possibly) optimize.
  /// @returns The node after the enabled tiling levels have been applied.
  static isl::schedule_node applyTileBandOpt(isl::schedule_node Node);

  /// Apply prevectorization on the bands in the schedule tree.
  ///
  /// @param Node The schedule node to (possibly) prevectorize.
  /// @returns The prevectorized node, or @p Node if no band member is
  ///          coincident.
  static isl::schedule_node applyPrevectBandOpt(isl::schedule_node Node);
};
368 
369 isl::schedule_node
370 ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node,
371                                                int VectorWidth) {
372   assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
373   Node = Node.child(0).child(0);
374   isl::union_map SchedRelUMap = Node.get_prefix_schedule_relation();
375   isl::union_set ScheduleRangeUSet = SchedRelUMap.range();
376   isl::set ScheduleRange{ScheduleRangeUSet};
377   isl::set IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth);
378   auto AtomicOption = getDimOptions(IsolateDomain.ctx(), "atomic");
379   isl::union_set IsolateOption = getIsolateOptions(IsolateDomain, 1);
380   Node = Node.parent().parent();
381   isl::union_set Options = IsolateOption.unite(AtomicOption);
382   isl::schedule_node_band Result =
383       Node.as<isl::schedule_node_band>().set_ast_build_options(Options);
384   return Result;
385 }
386 
387 struct InsertSimdMarkers : public ScheduleNodeRewriter<InsertSimdMarkers> {
388   isl::schedule_node visitBand(isl::schedule_node_band Band) {
389     isl::schedule_node Node = visitChildren(Band);
390 
391     // Only add SIMD markers to innermost bands.
392     if (!Node.first_child().isa<isl::schedule_node_leaf>())
393       return Node;
394 
395     isl::id LoopMarker = isl::id::alloc(Band.ctx(), "SIMD", nullptr);
396     return Band.insert_mark(LoopMarker);
397   }
398 };
399 
400 isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand(
401     isl::schedule_node Node, unsigned DimToVectorize, int VectorWidth) {
402   assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
403 
404   auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
405   unsigned ScheduleDimensions = unsignedFromIslSize(Space.dim(isl::dim::set));
406   assert(DimToVectorize < ScheduleDimensions);
407 
408   if (DimToVectorize > 0) {
409     Node = isl::manage(
410         isl_schedule_node_band_split(Node.release(), DimToVectorize));
411     Node = Node.child(0);
412   }
413   if (DimToVectorize < ScheduleDimensions - 1)
414     Node = isl::manage(isl_schedule_node_band_split(Node.release(), 1));
415   Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
416   auto Sizes = isl::multi_val::zero(Space);
417   Sizes = Sizes.set_val(0, isl::val(Node.ctx(), VectorWidth));
418   Node =
419       isl::manage(isl_schedule_node_band_tile(Node.release(), Sizes.release()));
420   Node = isolateFullPartialTiles(Node, VectorWidth);
421   Node = Node.child(0);
422   // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise,
423   // we will have troubles to match it in the backend.
424   Node = Node.as<isl::schedule_node_band>().set_ast_build_options(
425       isl::union_set(Node.ctx(), "{ unroll[x]: 1 = 0 }"));
426 
427   // Sink the inner loop into the smallest possible statements to make them
428   // represent a single vector instruction if possible.
429   Node = isl::manage(isl_schedule_node_band_sink(Node.release()));
430 
431   // Add SIMD markers to those vector statements.
432   InsertSimdMarkers SimdMarkerInserter;
433   Node = SimdMarkerInserter.visit(Node);
434 
435   PrevectOpts++;
436   return Node.parent();
437 }
438 
439 static bool isSimpleInnermostBand(const isl::schedule_node &Node) {
440   assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
441   assert(isl_schedule_node_n_children(Node.get()) == 1);
442 
443   auto ChildType = isl_schedule_node_get_type(Node.child(0).get());
444 
445   if (ChildType == isl_schedule_node_leaf)
446     return true;
447 
448   if (ChildType != isl_schedule_node_sequence)
449     return false;
450 
451   auto Sequence = Node.child(0);
452 
453   for (int c = 0, nc = isl_schedule_node_n_children(Sequence.get()); c < nc;
454        ++c) {
455     auto Child = Sequence.child(c);
456     if (isl_schedule_node_get_type(Child.get()) != isl_schedule_node_filter)
457       return false;
458     if (isl_schedule_node_get_type(Child.child(0).get()) !=
459         isl_schedule_node_leaf)
460       return false;
461   }
462   return true;
463 }
464 
465 bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) {
466   if (isl_schedule_node_get_type(Node.get()) != isl_schedule_node_band)
467     return false;
468 
469   if (isl_schedule_node_n_children(Node.get()) != 1)
470     return false;
471 
472   if (!isl_schedule_node_band_get_permutable(Node.get()))
473     return false;
474 
475   auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
476 
477   if (unsignedFromIslSize(Space.dim(isl::dim::set)) <= 1u)
478     return false;
479 
480   return isSimpleInnermostBand(Node);
481 }
482 
483 __isl_give isl::schedule_node
484 ScheduleTreeOptimizer::applyTileBandOpt(isl::schedule_node Node) {
485   if (FirstLevelTiling) {
486     Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes,
487                     FirstLevelDefaultTileSize);
488     FirstLevelTileOpts++;
489   }
490 
491   if (SecondLevelTiling) {
492     Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes,
493                     SecondLevelDefaultTileSize);
494     SecondLevelTileOpts++;
495   }
496 
497   if (RegisterTiling) {
498     Node =
499         applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize);
500     RegisterTileOpts++;
501   }
502 
503   return Node;
504 }
505 
506 isl::schedule_node
507 ScheduleTreeOptimizer::applyPrevectBandOpt(isl::schedule_node Node) {
508   auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
509   int Dims = unsignedFromIslSize(Space.dim(isl::dim::set));
510 
511   for (int i = Dims - 1; i >= 0; i--)
512     if (Node.as<isl::schedule_node_band>().member_get_coincident(i)) {
513       Node = prevectSchedBand(Node, i, PrevectorWidth);
514       break;
515     }
516 
517   return Node;
518 }
519 
520 __isl_give isl_schedule_node *
521 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg,
522                                     void *User) {
523   const OptimizerAdditionalInfoTy *OAI =
524       static_cast<const OptimizerAdditionalInfoTy *>(User);
525   assert(OAI && "Expecting optimization options");
526 
527   isl::schedule_node Node = isl::manage(NodeArg);
528   if (!isTileableBandNode(Node))
529     return Node.release();
530 
531   if (OAI->PatternOpts) {
532     isl::schedule_node PatternOptimizedSchedule =
533         tryOptimizeMatMulPattern(Node, OAI->TTI, OAI->D);
534     if (!PatternOptimizedSchedule.is_null()) {
535       MatMulOpts++;
536       return PatternOptimizedSchedule.release();
537     }
538   }
539 
540   if (OAI->Postopts)
541     Node = applyTileBandOpt(Node);
542 
543   if (OAI->Prevect) {
544     // FIXME: Prevectorization requirements are different from those checked by
545     // isTileableBandNode.
546     Node = applyPrevectBandOpt(Node);
547   }
548 
549   return Node.release();
550 }
551 
552 isl::schedule
553 ScheduleTreeOptimizer::optimizeSchedule(isl::schedule Schedule,
554                                         const OptimizerAdditionalInfoTy *OAI) {
555   auto Root = Schedule.get_root();
556   Root = optimizeScheduleNode(Root, OAI);
557   return Root.get_schedule();
558 }
559 
560 isl::schedule_node ScheduleTreeOptimizer::optimizeScheduleNode(
561     isl::schedule_node Node, const OptimizerAdditionalInfoTy *OAI) {
562   Node = isl::manage(isl_schedule_node_map_descendant_bottom_up(
563       Node.release(), optimizeBand,
564       const_cast<void *>(static_cast<const void *>(OAI))));
565   return Node;
566 }
567 
568 bool ScheduleTreeOptimizer::isProfitableSchedule(Scop &S,
569                                                  isl::schedule NewSchedule) {
570   // To understand if the schedule has been optimized we check if the schedule
571   // has changed at all.
572   // TODO: We can improve this by tracking if any necessarily beneficial
573   // transformations have been performed. This can e.g. be tiling, loop
574   // interchange, or ...) We can track this either at the place where the
575   // transformation has been performed or, in case of automatic ILP based
576   // optimizations, by comparing (yet to be defined) performance metrics
577   // before/after the scheduling optimizer
578   // (e.g., #stride-one accesses)
579   // FIXME: A schedule tree whose union_map-conversion is identical to the
580   // original schedule map may still allow for parallelization, i.e. can still
581   // be profitable.
582   auto NewScheduleMap = NewSchedule.get_map();
583   auto OldSchedule = S.getSchedule();
584   assert(!OldSchedule.is_null() &&
585          "Only IslScheduleOptimizer can insert extension nodes "
586          "that make Scop::getSchedule() return nullptr.");
587   bool changed = !OldSchedule.is_equal(NewScheduleMap);
588   return changed;
589 }
590 
/// ScopPass wrapper around the isl schedule optimizer.
///
/// Besides the pass interface it stores a schedule (LastSchedule) and a shared
/// isl context (IslCtx), both of which are dropped in releaseMemory().
class IslScheduleOptimizerWrapperPass : public ScopPass {
public:
  /// Pass identifier handed to the ScopPass constructor.
  static char ID;

  explicit IslScheduleOptimizerWrapperPass() : ScopPass(ID) {}

  /// Optimize the schedule of the SCoP @p S.
  bool runOnScop(Scop &S) override;

  /// Print the new schedule for the SCoP @p S.
  void printScop(raw_ostream &OS, Scop &S) const override;

  /// Register all analyses and transformation required.
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Release the internal memory.
  ///
  /// The schedule is cleared before the context reference is dropped.
  /// NOTE(review): presumably so that the isl_ctx outlives the isl objects
  /// allocated in it - confirm before reordering.
  void releaseMemory() override {
    LastSchedule = {};
    IslCtx.reset();
  }

private:
  /// Shared ownership of an isl context.
  /// NOTE(review): declared before LastSchedule, so it is destroyed after it;
  /// presumably intentional - confirm before reordering the members.
  std::shared_ptr<isl_ctx> IslCtx;
  /// Schedule kept between runs; reset to a null schedule in releaseMemory().
  isl::schedule LastSchedule;
};
616 
// Definition of the pass identifier that the constructor passes to ScopPass.
char IslScheduleOptimizerWrapperPass::ID = 0;
618 
619 #ifndef NDEBUG
620 static void printSchedule(llvm::raw_ostream &OS, const isl::schedule &Schedule,
621                           StringRef Desc) {
622   isl::ctx Ctx = Schedule.ctx();
623   isl_printer *P = isl_printer_to_str(Ctx.get());
624   P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
625   P = isl_printer_print_schedule(P, Schedule.get());
626   char *Str = isl_printer_get_str(P);
627   OS << Desc << ": \n" << Str << "\n";
628   free(Str);
629   isl_printer_free(P);
630 }
631 #endif
632 
633 /// Collect statistics for the schedule tree.
634 ///
635 /// @param Schedule The schedule tree to analyze. If not a schedule tree it is
636 /// ignored.
637 /// @param Version  The version of the schedule tree that is analyzed.
638 ///                 0 for the original schedule tree before any transformation.
639 ///                 1 for the schedule tree after isl's rescheduling.
640 ///                 2 for the schedule tree after optimizations are applied
641 ///                 (tiling, pattern matching)
642 static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) {
643   auto Root = Schedule.get_root();
644   if (Root.is_null())
645     return;
646 
647   isl_schedule_node_foreach_descendant_top_down(
648       Root.get(),
649       [](__isl_keep isl_schedule_node *nodeptr, void *user) -> isl_bool {
650         isl::schedule_node Node = isl::manage_copy(nodeptr);
651         int Version = *static_cast<int *>(user);
652 
653         switch (isl_schedule_node_get_type(Node.get())) {
654         case isl_schedule_node_band: {
655           NumBands[Version]++;
656           if (isl_schedule_node_band_get_permutable(Node.get()) ==
657               isl_bool_true)
658             NumPermutable[Version]++;
659 
660           int CountMembers = isl_schedule_node_band_n_member(Node.get());
661           NumBandMembers[Version] += CountMembers;
662           for (int i = 0; i < CountMembers; i += 1) {
663             if (Node.as<isl::schedule_node_band>().member_get_coincident(i))
664               NumCoincident[Version]++;
665           }
666           break;
667         }
668 
669         case isl_schedule_node_filter:
670           NumFilters[Version]++;
671           break;
672 
673         case isl_schedule_node_extension:
674           NumExtension[Version]++;
675           break;
676 
677         default:
678           break;
679         }
680 
681         return isl_bool_true;
682       },
683       &Version);
684 }
685 
686 static bool runIslScheduleOptimizer(
687     Scop &S,
688     function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps,
689     TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE,
690     isl::schedule &LastSchedule) {
691 
692   // Skip SCoPs in case they're already optimised by PPCGCodeGeneration
693   if (S.isToBeSkipped())
694     return false;
695 
696   // Skip empty SCoPs but still allow code generation as it will delete the
697   // loops present but not needed.
698   if (S.getSize() == 0) {
699     S.markAsOptimized();
700     return false;
701   }
702 
703   ScopsProcessed++;
704 
705   // Schedule without optimizations.
706   isl::schedule Schedule = S.getScheduleTree();
707   walkScheduleTreeForStatistics(S.getScheduleTree(), 0);
708   LLVM_DEBUG(printSchedule(dbgs(), Schedule, "Original schedule tree"));
709 
710   bool HasUserTransformation = false;
711   if (PragmaBasedOpts) {
712     isl::schedule ManuallyTransformed = applyManualTransformations(
713         &S, Schedule, GetDeps(Dependences::AL_Statement), ORE);
714     if (ManuallyTransformed.is_null()) {
715       LLVM_DEBUG(dbgs() << "Error during manual optimization\n");
716       return false;
717     }
718 
719     if (ManuallyTransformed.get() != Schedule.get()) {
720       // User transformations have precedence over other transformations.
721       HasUserTransformation = true;
722       Schedule = std::move(ManuallyTransformed);
723       LLVM_DEBUG(
724           printSchedule(dbgs(), Schedule, "After manual transformations"));
725     }
726   }
727 
728   // Only continue if either manual transformations have been applied or we are
729   // allowed to apply heuristics.
730   // TODO: Detect disabled heuristics and no user-directed transformation
731   // metadata earlier in ScopDetection.
732   if (!HasUserTransformation && S.hasDisableHeuristicsHint()) {
733     LLVM_DEBUG(dbgs() << "Heuristic optimizations disabled by metadata\n");
734     return false;
735   }
736 
737   // Get dependency analysis.
738   const Dependences &D = GetDeps(Dependences::AL_Statement);
739   if (D.getSharedIslCtx() != S.getSharedIslCtx()) {
740     LLVM_DEBUG(dbgs() << "DependenceInfo for another SCoP/isl_ctx\n");
741     return false;
742   }
743   if (!D.hasValidDependences()) {
744     LLVM_DEBUG(dbgs() << "Dependency information not available\n");
745     return false;
746   }
747 
  // Apply ISL's algorithm only if not overridden by the user. Note that
  // post-rescheduling optimizations (tiling, pattern-based, prevectorization)
  // rely on the coincidence/permutable annotations on schedule tree bands that
  // are added by the rescheduling analyzer. Therefore, disabling the
  // rescheduler implicitly also disables these optimizations.
753   if (!EnableReschedule) {
754     LLVM_DEBUG(dbgs() << "Skipping rescheduling due to command line option\n");
755   } else if (HasUserTransformation) {
756     LLVM_DEBUG(
757         dbgs() << "Skipping rescheduling due to manual transformation\n");
758   } else {
759     // Build input data.
760     int ValidityKinds =
761         Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
762     int ProximityKinds;
763 
764     if (OptimizeDeps == "all")
765       ProximityKinds =
766           Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
767     else if (OptimizeDeps == "raw")
768       ProximityKinds = Dependences::TYPE_RAW;
769     else {
770       errs() << "Do not know how to optimize for '" << OptimizeDeps << "'"
771              << " Falling back to optimizing all dependences.\n";
772       ProximityKinds =
773           Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
774     }
775 
776     isl::union_set Domain = S.getDomains();
777 
778     if (Domain.is_null())
779       return false;
780 
781     isl::union_map Validity = D.getDependences(ValidityKinds);
782     isl::union_map Proximity = D.getDependences(ProximityKinds);
783 
784     // Simplify the dependences by removing the constraints introduced by the
785     // domains. This can speed up the scheduling time significantly, as large
786     // constant coefficients will be removed from the dependences. The
787     // introduction of some additional dependences reduces the possible
    // transformations, but in most cases, such transformations do not seem to
    // be interesting anyway. In some cases this option may prevent the
    // scheduler from finding any schedule.
791     if (SimplifyDeps == "yes") {
792       Validity = Validity.gist_domain(Domain);
793       Validity = Validity.gist_range(Domain);
794       Proximity = Proximity.gist_domain(Domain);
795       Proximity = Proximity.gist_range(Domain);
796     } else if (SimplifyDeps != "no") {
797       errs()
798           << "warning: Option -polly-opt-simplify-deps should either be 'yes' "
799              "or 'no'. Falling back to default: 'yes'\n";
800     }
801 
802     LLVM_DEBUG(dbgs() << "\n\nCompute schedule from: ");
803     LLVM_DEBUG(dbgs() << "Domain := " << Domain << ";\n");
804     LLVM_DEBUG(dbgs() << "Proximity := " << Proximity << ";\n");
805     LLVM_DEBUG(dbgs() << "Validity := " << Validity << ";\n");
806 
807     int IslMaximizeBands;
808     if (MaximizeBandDepth == "yes") {
809       IslMaximizeBands = 1;
810     } else if (MaximizeBandDepth == "no") {
811       IslMaximizeBands = 0;
812     } else {
813       errs()
814           << "warning: Option -polly-opt-maximize-bands should either be 'yes'"
815              " or 'no'. Falling back to default: 'yes'\n";
816       IslMaximizeBands = 1;
817     }
818 
819     int IslOuterCoincidence;
820     if (OuterCoincidence == "yes") {
821       IslOuterCoincidence = 1;
822     } else if (OuterCoincidence == "no") {
823       IslOuterCoincidence = 0;
824     } else {
825       errs() << "warning: Option -polly-opt-outer-coincidence should either be "
826                 "'yes' or 'no'. Falling back to default: 'no'\n";
827       IslOuterCoincidence = 0;
828     }
829 
830     isl_ctx *Ctx = S.getIslCtx().get();
831 
832     isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence);
833     isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands);
834     isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm);
835     isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient);
836     isl_options_set_tile_scale_tile_loops(Ctx, 0);
837 
838     auto OnErrorStatus = isl_options_get_on_error(Ctx);
839     isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE);
840 
841     auto SC = isl::schedule_constraints::on_domain(Domain);
842     SC = SC.set_proximity(Proximity);
843     SC = SC.set_validity(Validity);
844     SC = SC.set_coincidence(Validity);
845     Schedule = SC.compute_schedule();
846     isl_options_set_on_error(Ctx, OnErrorStatus);
847 
848     ScopsRescheduled++;
849     LLVM_DEBUG(printSchedule(dbgs(), Schedule, "After rescheduling"));
850   }
851 
852   walkScheduleTreeForStatistics(Schedule, 1);
853 
854   // In cases the scheduler is not able to optimize the code, we just do not
855   // touch the schedule.
856   if (Schedule.is_null())
857     return false;
858 
859   if (GreedyFusion) {
860     isl::union_map Validity = D.getDependences(
861         Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW);
862     Schedule = applyGreedyFusion(Schedule, Validity);
863     assert(!Schedule.is_null());
864   }
865 
866   // Apply post-rescheduling optimizations (if enabled) and/or prevectorization.
867   const OptimizerAdditionalInfoTy OAI = {
868       TTI, const_cast<Dependences *>(&D),
869       /*PatternOpts=*/!HasUserTransformation && PMBasedOpts,
870       /*Postopts=*/!HasUserTransformation && EnablePostopts,
871       /*Prevect=*/PollyVectorizerChoice != VECTORIZER_NONE};
872   if (OAI.PatternOpts || OAI.Postopts || OAI.Prevect) {
873     Schedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI);
874     Schedule = hoistExtensionNodes(Schedule);
875     LLVM_DEBUG(printSchedule(dbgs(), Schedule, "After post-optimizations"));
876     walkScheduleTreeForStatistics(Schedule, 2);
877   }
878 
879   // Skip profitability check if user transformation(s) have been applied.
880   if (!HasUserTransformation &&
881       !ScheduleTreeOptimizer::isProfitableSchedule(S, Schedule))
882     return false;
883 
884   auto ScopStats = S.getStatistics();
885   ScopsOptimized++;
886   NumAffineLoopsOptimized += ScopStats.NumAffineLoops;
887   NumBoxedLoopsOptimized += ScopStats.NumBoxedLoops;
888   LastSchedule = Schedule;
889 
890   S.setScheduleTree(Schedule);
891   S.markAsOptimized();
892 
893   if (OptimizedScops)
894     errs() << S;
895 
896   return false;
897 }
898 
899 bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) {
900   releaseMemory();
901 
902   Function &F = S.getFunction();
903   IslCtx = S.getSharedIslCtx();
904 
905   auto getDependences =
906       [this](Dependences::AnalysisLevel) -> const Dependences & {
907     return getAnalysis<DependenceInfo>().getDependences(
908         Dependences::AL_Statement);
909   };
910   OptimizationRemarkEmitter &ORE =
911       getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
912   TargetTransformInfo *TTI =
913       &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
914   return runIslScheduleOptimizer(S, getDependences, TTI, &ORE, LastSchedule);
915 }
916 
917 static void runScheduleOptimizerPrinter(raw_ostream &OS,
918                                         isl::schedule LastSchedule) {
919   isl_printer *p;
920   char *ScheduleStr;
921 
922   OS << "Calculated schedule:\n";
923 
924   if (LastSchedule.is_null()) {
925     OS << "n/a\n";
926     return;
927   }
928 
929   p = isl_printer_to_str(LastSchedule.ctx().get());
930   p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK);
931   p = isl_printer_print_schedule(p, LastSchedule.get());
932   ScheduleStr = isl_printer_get_str(p);
933   isl_printer_free(p);
934 
935   OS << ScheduleStr << "\n";
936 
937   free(ScheduleStr);
938 }
939 
940 void IslScheduleOptimizerWrapperPass::printScop(raw_ostream &OS, Scop &) const {
941   runScheduleOptimizerPrinter(OS, LastSchedule);
942 }
943 
944 void IslScheduleOptimizerWrapperPass::getAnalysisUsage(
945     AnalysisUsage &AU) const {
946   ScopPass::getAnalysisUsage(AU);
947   AU.addRequired<DependenceInfo>();
948   AU.addRequired<TargetTransformInfoWrapperPass>();
949   AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
950 
951   AU.addPreserved<DependenceInfo>();
952   AU.addPreserved<OptimizationRemarkEmitterWrapperPass>();
953 }
954 
955 } // namespace
956 
957 Pass *polly::createIslScheduleOptimizerWrapperPass() {
958   return new IslScheduleOptimizerWrapperPass();
959 }
960 
// Legacy pass-manager registration of IslScheduleOptimizerWrapperPass under
// the argument string "polly-opt-isl", together with the passes listed as its
// dependencies. The BEGIN and END invocations must carry the same arguments.
INITIALIZE_PASS_BEGIN(IslScheduleOptimizerWrapperPass, "polly-opt-isl",
                      "Polly - Optimize schedule of SCoP", false, false);
INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass);
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass);
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass);
INITIALIZE_PASS_END(IslScheduleOptimizerWrapperPass, "polly-opt-isl",
                    "Polly - Optimize schedule of SCoP", false, false)
969 
970 static llvm::PreservedAnalyses
971 runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM,
972                                 ScopStandardAnalysisResults &SAR, SPMUpdater &U,
973                                 raw_ostream *OS) {
974   DependenceAnalysis::Result &Deps = SAM.getResult<DependenceAnalysis>(S, SAR);
975   auto GetDeps = [&Deps](Dependences::AnalysisLevel) -> const Dependences & {
976     return Deps.getDependences(Dependences::AL_Statement);
977   };
978   OptimizationRemarkEmitter ORE(&S.getFunction());
979   TargetTransformInfo *TTI = &SAR.TTI;
980   isl::schedule LastSchedule;
981   bool Modified = runIslScheduleOptimizer(S, GetDeps, TTI, &ORE, LastSchedule);
982   if (OS) {
983     *OS << "Printing analysis 'Polly - Optimize schedule of SCoP' for region: '"
984         << S.getName() << "' in function '" << S.getFunction().getName()
985         << "':\n";
986     runScheduleOptimizerPrinter(*OS, LastSchedule);
987   }
988 
989   if (!Modified)
990     return PreservedAnalyses::all();
991 
992   PreservedAnalyses PA;
993   PA.preserveSet<AllAnalysesOn<Module>>();
994   PA.preserveSet<AllAnalysesOn<Function>>();
995   PA.preserveSet<AllAnalysesOn<Loop>>();
996   return PA;
997 }
998 
999 llvm::PreservedAnalyses
1000 IslScheduleOptimizerPass::run(Scop &S, ScopAnalysisManager &SAM,
1001                               ScopStandardAnalysisResults &SAR, SPMUpdater &U) {
1002   return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, nullptr);
1003 }
1004 
1005 llvm::PreservedAnalyses
1006 IslScheduleOptimizerPrinterPass::run(Scop &S, ScopAnalysisManager &SAM,
1007                                      ScopStandardAnalysisResults &SAR,
1008                                      SPMUpdater &U) {
1009   return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, &OS);
1010 }
1011