1 //===- ScheduleOptimizer.cpp - Calculate an optimized schedule ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass generates an entirely new schedule tree from the data dependences
10 // and iteration domains. The new schedule tree is computed in two steps:
11 //
12 // 1) The isl scheduling optimizer is run
13 //
14 // The isl scheduling optimizer creates a new schedule tree that maximizes
15 // parallelism and tileability and minimizes data-dependence distances. The
16 // algorithm used is a modified version of the ``Pluto'' algorithm:
17 //
18 //   U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
19 //   A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
20 //   In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
21 //   Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008.
22 //
23 // 2) A set of post-scheduling transformations is applied on the schedule tree.
24 //
25 // These optimizations include:
26 //
27 //  - Tiling of the innermost tilable bands
28 //  - Prevectorization - The choice of a possible outer loop that is strip-mined
29 //                       to the innermost level to enable inner-loop
30 //                       vectorization.
31 //  - Some optimizations for spatial locality are also planned.
32 //
33 // For a detailed description of the schedule tree itself please see section 6
34 // of:
35 //
36 // Polyhedral AST generation is more than scanning polyhedra
37 // Tobias Grosser, Sven Verdoolaege, Albert Cohen
38 // ACM Transactions on Programming Languages and Systems (TOPLAS),
39 // 37(4), July 2015
40 // http://www.grosser.es/#pub-polyhedral-AST-generation
41 //
42 // This publication also contains a detailed discussion of the different options
43 // for polyhedral loop unrolling, full/partial tile separation and other uses
44 // of the schedule tree.
45 //
46 //===----------------------------------------------------------------------===//
47 
48 #include "polly/ScheduleOptimizer.h"
49 #include "polly/CodeGen/CodeGeneration.h"
50 #include "polly/DependenceInfo.h"
51 #include "polly/ManualOptimizer.h"
52 #include "polly/MatmulOptimizer.h"
53 #include "polly/Options.h"
54 #include "polly/ScheduleTreeTransform.h"
55 #include "polly/Support/ISLOStream.h"
56 #include "llvm/ADT/Sequence.h"
57 #include "llvm/ADT/Statistic.h"
58 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
59 #include "llvm/InitializePasses.h"
60 #include "llvm/Support/CommandLine.h"
61 #include "isl/options.h"
62 
63 using namespace llvm;
64 using namespace polly;
65 
66 namespace llvm {
67 class Loop;
68 class Module;
69 } // namespace llvm
70 
71 #define DEBUG_TYPE "polly-opt-isl"
72 
73 static cl::opt<std::string>
74     OptimizeDeps("polly-opt-optimize-only",
75                  cl::desc("Only a certain kind of dependences (all/raw)"),
76                  cl::Hidden, cl::init("all"), cl::ZeroOrMore,
77                  cl::cat(PollyCategory));
78 
79 static cl::opt<std::string>
80     SimplifyDeps("polly-opt-simplify-deps",
81                  cl::desc("Dependences should be simplified (yes/no)"),
82                  cl::Hidden, cl::init("yes"), cl::ZeroOrMore,
83                  cl::cat(PollyCategory));
84 
85 static cl::opt<int> MaxConstantTerm(
86     "polly-opt-max-constant-term",
87     cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden,
88     cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));
89 
90 static cl::opt<int> MaxCoefficient(
91     "polly-opt-max-coefficient",
92     cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden,
93     cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));
94 
95 static cl::opt<std::string> FusionStrategy(
96     "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"),
97     cl::Hidden, cl::init("min"), cl::ZeroOrMore, cl::cat(PollyCategory));
98 
99 static cl::opt<std::string>
100     MaximizeBandDepth("polly-opt-maximize-bands",
101                       cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
102                       cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory));
103 
104 static cl::opt<std::string> OuterCoincidence(
105     "polly-opt-outer-coincidence",
106     cl::desc("Try to construct schedules where the outer member of each band "
107              "satisfies the coincidence constraints (yes/no)"),
108     cl::Hidden, cl::init("no"), cl::ZeroOrMore, cl::cat(PollyCategory));
109 
110 static cl::opt<int> PrevectorWidth(
111     "polly-prevect-width",
112     cl::desc(
113         "The number of loop iterations to strip-mine for pre-vectorization"),
114     cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory));
115 
116 static cl::opt<bool> FirstLevelTiling("polly-tiling",
117                                       cl::desc("Enable loop tiling"),
118                                       cl::init(true), cl::ZeroOrMore,
119                                       cl::cat(PollyCategory));
120 
121 static cl::opt<int> FirstLevelDefaultTileSize(
122     "polly-default-tile-size",
123     cl::desc("The default tile size (if not enough were provided by"
124              " --polly-tile-sizes)"),
125     cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory));
126 
127 static cl::list<int>
128     FirstLevelTileSizes("polly-tile-sizes",
129                         cl::desc("A tile size for each loop dimension, filled "
130                                  "with --polly-default-tile-size"),
131                         cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
132                         cl::cat(PollyCategory));
133 
134 static cl::opt<bool>
135     SecondLevelTiling("polly-2nd-level-tiling",
136                       cl::desc("Enable a 2nd level loop of loop tiling"),
137                       cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
138 
139 static cl::opt<int> SecondLevelDefaultTileSize(
140     "polly-2nd-level-default-tile-size",
141     cl::desc("The default 2nd-level tile size (if not enough were provided by"
142              " --polly-2nd-level-tile-sizes)"),
143     cl::Hidden, cl::init(16), cl::ZeroOrMore, cl::cat(PollyCategory));
144 
145 static cl::list<int>
146     SecondLevelTileSizes("polly-2nd-level-tile-sizes",
147                          cl::desc("A tile size for each loop dimension, filled "
148                                   "with --polly-default-tile-size"),
149                          cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
150                          cl::cat(PollyCategory));
151 
152 static cl::opt<bool> RegisterTiling("polly-register-tiling",
153                                     cl::desc("Enable register tiling"),
154                                     cl::init(false), cl::ZeroOrMore,
155                                     cl::cat(PollyCategory));
156 
157 static cl::opt<int> RegisterDefaultTileSize(
158     "polly-register-tiling-default-tile-size",
159     cl::desc("The default register tile size (if not enough were provided by"
160              " --polly-register-tile-sizes)"),
161     cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory));
162 
163 static cl::list<int>
164     RegisterTileSizes("polly-register-tile-sizes",
165                       cl::desc("A tile size for each loop dimension, filled "
166                                "with --polly-register-tile-size"),
167                       cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
168                       cl::cat(PollyCategory));
169 
170 static cl::opt<bool> PragmaBasedOpts(
171     "polly-pragma-based-opts",
172     cl::desc("Apply user-directed transformation from metadata"),
173     cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory));
174 
175 static cl::opt<bool> EnableReschedule("polly-reschedule",
176                                       cl::desc("Optimize SCoPs using ISL"),
177                                       cl::init(true), cl::ZeroOrMore,
178                                       cl::cat(PollyCategory));
179 
180 static cl::opt<bool>
181     PMBasedOpts("polly-pattern-matching-based-opts",
182                 cl::desc("Perform optimizations based on pattern matching"),
183                 cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory));
184 
185 static cl::opt<bool>
186     EnablePostopts("polly-postopts",
187                    cl::desc("Apply post-rescheduling optimizations such as "
188                             "tiling (requires -polly-reschedule)"),
189                    cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory));
190 
191 static cl::opt<bool> OptimizedScops(
192     "polly-optimized-scops",
193     cl::desc("Polly - Dump polyhedral description of Scops optimized with "
194              "the isl scheduling optimizer and the set of post-scheduling "
195              "transformations is applied on the schedule tree"),
196     cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
197 
198 STATISTIC(ScopsProcessed, "Number of scops processed");
199 STATISTIC(ScopsRescheduled, "Number of scops rescheduled");
200 STATISTIC(ScopsOptimized, "Number of scops optimized");
201 
202 STATISTIC(NumAffineLoopsOptimized, "Number of affine loops optimized");
203 STATISTIC(NumBoxedLoopsOptimized, "Number of boxed loops optimized");
204 
205 #define THREE_STATISTICS(VARNAME, DESC)                                        \
206   static Statistic VARNAME[3] = {                                              \
207       {DEBUG_TYPE, #VARNAME "0", DESC " (original)"},                          \
208       {DEBUG_TYPE, #VARNAME "1", DESC " (after scheduler)"},                   \
209       {DEBUG_TYPE, #VARNAME "2", DESC " (after optimizer)"}}
210 
211 THREE_STATISTICS(NumBands, "Number of bands");
212 THREE_STATISTICS(NumBandMembers, "Number of band members");
213 THREE_STATISTICS(NumCoincident, "Number of coincident band members");
214 THREE_STATISTICS(NumPermutable, "Number of permutable bands");
215 THREE_STATISTICS(NumFilters, "Number of filter nodes");
216 THREE_STATISTICS(NumExtension, "Number of extension nodes");
217 
218 STATISTIC(FirstLevelTileOpts, "Number of first level tiling applied");
219 STATISTIC(SecondLevelTileOpts, "Number of second level tiling applied");
220 STATISTIC(RegisterTileOpts, "Number of register tiling applied");
221 STATISTIC(PrevectOpts, "Number of strip-mining for prevectorization applied");
222 STATISTIC(MatMulOpts,
223           "Number of matrix multiplication patterns detected and optimized");
224 
225 namespace {
226 /// Additional parameters of the schedule optimizer.
227 ///
228 /// Target Transform Info and the SCoP dependencies used by the schedule
229 /// optimizer.
230 struct OptimizerAdditionalInfoTy {
231   const llvm::TargetTransformInfo *TTI;
232   const Dependences *D;
233   bool PatternOpts;
234   bool Postopts;
235   bool Prevect;
236 };
237 
238 class ScheduleTreeOptimizer {
239 public:
240   /// Apply schedule tree transformations.
241   ///
242   /// This function takes an (possibly already optimized) schedule tree and
243   /// applies a set of additional optimizations on the schedule tree. The
244   /// transformations applied include:
245   ///
246   ///   - Pattern-based optimizations
247   ///   - Tiling
248   ///   - Prevectorization
249   ///
250   /// @param Schedule The schedule object the transformations will be applied
251   ///                 to.
252   /// @param OAI      Target Transform Info and the SCoP dependencies.
253   /// @returns        The transformed schedule.
254   static isl::schedule
255   optimizeSchedule(isl::schedule Schedule,
256                    const OptimizerAdditionalInfoTy *OAI = nullptr);
257 
258   /// Apply schedule tree transformations.
259   ///
260   /// This function takes a node in an (possibly already optimized) schedule
261   /// tree and applies a set of additional optimizations on this schedule tree
262   /// node and its descendants. The transformations applied include:
263   ///
264   ///   - Pattern-based optimizations
265   ///   - Tiling
266   ///   - Prevectorization
267   ///
268   /// @param Node The schedule object post-transformations will be applied to.
269   /// @param OAI  Target Transform Info and the SCoP dependencies.
270   /// @returns    The transformed schedule.
271   static isl::schedule_node
272   optimizeScheduleNode(isl::schedule_node Node,
273                        const OptimizerAdditionalInfoTy *OAI = nullptr);
274 
275   /// Decide if the @p NewSchedule is profitable for @p S.
276   ///
277   /// @param S           The SCoP we optimize.
278   /// @param NewSchedule The new schedule we computed.
279   ///
280   /// @return True, if we believe @p NewSchedule is an improvement for @p S.
281   static bool isProfitableSchedule(polly::Scop &S, isl::schedule NewSchedule);
282 
283   /// Isolate a set of partial tile prefixes.
284   ///
285   /// This set should ensure that it contains only partial tile prefixes that
286   /// have exactly VectorWidth iterations.
287   ///
288   /// @param Node A schedule node band, which is a parent of a band node,
289   ///             that contains a vector loop.
290   /// @return Modified isl_schedule_node.
291   static isl::schedule_node isolateFullPartialTiles(isl::schedule_node Node,
292                                                     int VectorWidth);
293 
294 private:
295   /// Check if this node is a band node we want to tile.
296   ///
297   /// We look for innermost band nodes where individual dimensions are marked as
298   /// permutable.
299   ///
300   /// @param Node The node to check.
301   static bool isTileableBandNode(isl::schedule_node Node);
302 
303   /// Pre-vectorizes one scheduling dimension of a schedule band.
304   ///
305   /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and
306   /// sinks the resulting point loop.
307   ///
308   /// Example (DimToVectorize=0, VectorWidth=4):
309   ///
310   /// | Before transformation:
311   /// |
312   /// | A[i,j] -> [i,j]
313   /// |
314   /// | for (i = 0; i < 128; i++)
315   /// |    for (j = 0; j < 128; j++)
316   /// |      A(i,j);
317   ///
318   /// | After transformation:
319   /// |
320   /// | for (it = 0; it < 32; it+=1)
321   /// |    for (j = 0; j < 128; j++)
322   /// |      for (ip = 0; ip <= 3; ip++)
323   /// |        A(4 * it + ip,j);
324   ///
325   /// The goal of this transformation is to create a trivially vectorizable
326   /// loop.  This means a parallel loop at the innermost level that has a
327   /// constant number of iterations corresponding to the target vector width.
328   ///
329   /// This transformation creates a loop at the innermost level. The loop has
330   /// a constant number of iterations, if the number of loop iterations at
331   /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is
332   /// currently constant and not yet target specific. This function does not
333   /// reason about parallelism.
334   static isl::schedule_node prevectSchedBand(isl::schedule_node Node,
335                                              unsigned DimToVectorize,
336                                              int VectorWidth);
337 
338   /// Apply additional optimizations on the bands in the schedule tree.
339   ///
340   /// We are looking for an innermost band node and apply the following
341   /// transformations:
342   ///
343   ///  - Tile the band
344   ///      - if the band is tileable
345   ///      - if the band has more than one loop dimension
346   ///
347   ///  - Prevectorize the schedule of the band (or the point loop in case of
348   ///    tiling).
349   ///      - if vectorization is enabled
350   ///
351   /// @param Node The schedule node to (possibly) optimize.
352   /// @param User A pointer to forward some use information
353   ///        (currently unused).
354   static isl_schedule_node *optimizeBand(isl_schedule_node *Node, void *User);
355 
356   /// Apply tiling optimizations on the bands in the schedule tree.
357   ///
358   /// @param Node The schedule node to (possibly) optimize.
359   static isl::schedule_node applyTileBandOpt(isl::schedule_node Node);
360 
361   /// Apply prevectorization on the bands in the schedule tree.
362   ///
363   /// @param Node The schedule node to (possibly) prevectorize.
364   static isl::schedule_node applyPrevectBandOpt(isl::schedule_node Node);
365 };
366 
367 isl::schedule_node
368 ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node,
369                                                int VectorWidth) {
370   assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
371   Node = Node.child(0).child(0);
372   isl::union_map SchedRelUMap = Node.get_prefix_schedule_relation();
373   isl::union_set ScheduleRangeUSet = SchedRelUMap.range();
374   isl::set ScheduleRange{ScheduleRangeUSet};
375   isl::set IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth);
376   auto AtomicOption = getDimOptions(IsolateDomain.ctx(), "atomic");
377   isl::union_set IsolateOption = getIsolateOptions(IsolateDomain, 1);
378   Node = Node.parent().parent();
379   isl::union_set Options = IsolateOption.unite(AtomicOption);
380   isl::schedule_node_band Result =
381       Node.as<isl::schedule_node_band>().set_ast_build_options(Options);
382   return Result;
383 }
384 
385 isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand(
386     isl::schedule_node Node, unsigned DimToVectorize, int VectorWidth) {
387   assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
388 
389   auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
390   isl_size ScheduleDimensions = Space.dim(isl::dim::set).release();
391   assert((isl_size)DimToVectorize < ScheduleDimensions);
392 
393   if (DimToVectorize > 0) {
394     Node = isl::manage(
395         isl_schedule_node_band_split(Node.release(), DimToVectorize));
396     Node = Node.child(0);
397   }
398   if ((isl_size)DimToVectorize < ScheduleDimensions - 1)
399     Node = isl::manage(isl_schedule_node_band_split(Node.release(), 1));
400   Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
401   auto Sizes = isl::multi_val::zero(Space);
402   Sizes = Sizes.set_val(0, isl::val(Node.ctx(), VectorWidth));
403   Node =
404       isl::manage(isl_schedule_node_band_tile(Node.release(), Sizes.release()));
405   Node = isolateFullPartialTiles(Node, VectorWidth);
406   Node = Node.child(0);
407   // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise,
408   // we will have troubles to match it in the backend.
409   isl::schedule_node_band NodeBand =
410       Node.as<isl::schedule_node_band>().set_ast_build_options(
411           isl::union_set(Node.ctx(), "{ unroll[x]: 1 = 0 }"));
412   Node = isl::manage(isl_schedule_node_band_sink(NodeBand.release()));
413   Node = Node.child(0);
414   if (isl_schedule_node_get_type(Node.get()) == isl_schedule_node_leaf)
415     Node = Node.parent();
416   auto LoopMarker = isl::id::alloc(Node.ctx(), "SIMD", nullptr);
417   PrevectOpts++;
418   return Node.insert_mark(LoopMarker);
419 }
420 
421 static bool isSimpleInnermostBand(const isl::schedule_node &Node) {
422   assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
423   assert(isl_schedule_node_n_children(Node.get()) == 1);
424 
425   auto ChildType = isl_schedule_node_get_type(Node.child(0).get());
426 
427   if (ChildType == isl_schedule_node_leaf)
428     return true;
429 
430   if (ChildType != isl_schedule_node_sequence)
431     return false;
432 
433   auto Sequence = Node.child(0);
434 
435   for (int c = 0, nc = isl_schedule_node_n_children(Sequence.get()); c < nc;
436        ++c) {
437     auto Child = Sequence.child(c);
438     if (isl_schedule_node_get_type(Child.get()) != isl_schedule_node_filter)
439       return false;
440     if (isl_schedule_node_get_type(Child.child(0).get()) !=
441         isl_schedule_node_leaf)
442       return false;
443   }
444   return true;
445 }
446 
447 bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) {
448   if (isl_schedule_node_get_type(Node.get()) != isl_schedule_node_band)
449     return false;
450 
451   if (isl_schedule_node_n_children(Node.get()) != 1)
452     return false;
453 
454   if (!isl_schedule_node_band_get_permutable(Node.get()))
455     return false;
456 
457   auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
458   auto Dims = Space.dim(isl::dim::set).release();
459 
460   if (Dims <= 1)
461     return false;
462 
463   return isSimpleInnermostBand(Node);
464 }
465 
466 __isl_give isl::schedule_node
467 ScheduleTreeOptimizer::applyTileBandOpt(isl::schedule_node Node) {
468   if (FirstLevelTiling) {
469     Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes,
470                     FirstLevelDefaultTileSize);
471     FirstLevelTileOpts++;
472   }
473 
474   if (SecondLevelTiling) {
475     Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes,
476                     SecondLevelDefaultTileSize);
477     SecondLevelTileOpts++;
478   }
479 
480   if (RegisterTiling) {
481     Node =
482         applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize);
483     RegisterTileOpts++;
484   }
485 
486   return Node;
487 }
488 
489 isl::schedule_node
490 ScheduleTreeOptimizer::applyPrevectBandOpt(isl::schedule_node Node) {
491   auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
492   auto Dims = Space.dim(isl::dim::set).release();
493 
494   for (int i = Dims - 1; i >= 0; i--)
495     if (Node.as<isl::schedule_node_band>().member_get_coincident(i)) {
496       Node = prevectSchedBand(Node, i, PrevectorWidth);
497       break;
498     }
499 
500   return Node;
501 }
502 
503 __isl_give isl_schedule_node *
504 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg,
505                                     void *User) {
506   const OptimizerAdditionalInfoTy *OAI =
507       static_cast<const OptimizerAdditionalInfoTy *>(User);
508   assert(OAI && "Expecting optimization options");
509 
510   isl::schedule_node Node = isl::manage(NodeArg);
511   if (!isTileableBandNode(Node))
512     return Node.release();
513 
514   if (OAI->PatternOpts) {
515     isl::schedule_node PatternOptimizedSchedule =
516         tryOptimizeMatMulPattern(Node, OAI->TTI, OAI->D);
517     if (!PatternOptimizedSchedule.is_null()) {
518       MatMulOpts++;
519       return PatternOptimizedSchedule.release();
520     }
521   }
522 
523   if (OAI->Postopts)
524     Node = applyTileBandOpt(Node);
525 
526   if (OAI->Prevect) {
527     // FIXME: Prevectorization requirements are different from those checked by
528     // isTileableBandNode.
529     Node = applyPrevectBandOpt(Node);
530   }
531 
532   return Node.release();
533 }
534 
535 isl::schedule
536 ScheduleTreeOptimizer::optimizeSchedule(isl::schedule Schedule,
537                                         const OptimizerAdditionalInfoTy *OAI) {
538   auto Root = Schedule.get_root();
539   Root = optimizeScheduleNode(Root, OAI);
540   return Root.get_schedule();
541 }
542 
543 isl::schedule_node ScheduleTreeOptimizer::optimizeScheduleNode(
544     isl::schedule_node Node, const OptimizerAdditionalInfoTy *OAI) {
545   Node = isl::manage(isl_schedule_node_map_descendant_bottom_up(
546       Node.release(), optimizeBand,
547       const_cast<void *>(static_cast<const void *>(OAI))));
548   return Node;
549 }
550 
551 bool ScheduleTreeOptimizer::isProfitableSchedule(Scop &S,
552                                                  isl::schedule NewSchedule) {
553   // To understand if the schedule has been optimized we check if the schedule
554   // has changed at all.
555   // TODO: We can improve this by tracking if any necessarily beneficial
556   // transformations have been performed. This can e.g. be tiling, loop
557   // interchange, or ...) We can track this either at the place where the
558   // transformation has been performed or, in case of automatic ILP based
559   // optimizations, by comparing (yet to be defined) performance metrics
560   // before/after the scheduling optimizer
561   // (e.g., #stride-one accesses)
562   // FIXME: A schedule tree whose union_map-conversion is identical to the
563   // original schedule map may still allow for parallelization, i.e. can still
564   // be profitable.
565   auto NewScheduleMap = NewSchedule.get_map();
566   auto OldSchedule = S.getSchedule();
567   assert(!OldSchedule.is_null() &&
568          "Only IslScheduleOptimizer can insert extension nodes "
569          "that make Scop::getSchedule() return nullptr.");
570   bool changed = !OldSchedule.is_equal(NewScheduleMap);
571   return changed;
572 }
573 
574 class IslScheduleOptimizerWrapperPass : public ScopPass {
575 public:
576   static char ID;
577 
578   explicit IslScheduleOptimizerWrapperPass() : ScopPass(ID) {}
579 
580   /// Optimize the schedule of the SCoP @p S.
581   bool runOnScop(Scop &S) override;
582 
583   /// Print the new schedule for the SCoP @p S.
584   void printScop(raw_ostream &OS, Scop &S) const override;
585 
586   /// Register all analyses and transformation required.
587   void getAnalysisUsage(AnalysisUsage &AU) const override;
588 
589   /// Release the internal memory.
590   void releaseMemory() override {
591     LastSchedule = {};
592     IslCtx.reset();
593   }
594 
595 private:
596   std::shared_ptr<isl_ctx> IslCtx;
597   isl::schedule LastSchedule;
598 };
599 
600 char IslScheduleOptimizerWrapperPass::ID = 0;
601 
602 #ifndef NDEBUG
603 static void printSchedule(llvm::raw_ostream &OS, const isl::schedule &Schedule,
604                           StringRef Desc) {
605   isl::ctx Ctx = Schedule.ctx();
606   isl_printer *P = isl_printer_to_str(Ctx.get());
607   P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
608   P = isl_printer_print_schedule(P, Schedule.get());
609   char *Str = isl_printer_get_str(P);
610   OS << Desc << ": \n" << Str << "\n";
611   free(Str);
612   isl_printer_free(P);
613 }
614 #endif
615 
616 /// Collect statistics for the schedule tree.
617 ///
618 /// @param Schedule The schedule tree to analyze. If not a schedule tree it is
619 /// ignored.
620 /// @param Version  The version of the schedule tree that is analyzed.
621 ///                 0 for the original schedule tree before any transformation.
622 ///                 1 for the schedule tree after isl's rescheduling.
623 ///                 2 for the schedule tree after optimizations are applied
624 ///                 (tiling, pattern matching)
625 static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) {
626   auto Root = Schedule.get_root();
627   if (Root.is_null())
628     return;
629 
630   isl_schedule_node_foreach_descendant_top_down(
631       Root.get(),
632       [](__isl_keep isl_schedule_node *nodeptr, void *user) -> isl_bool {
633         isl::schedule_node Node = isl::manage_copy(nodeptr);
634         int Version = *static_cast<int *>(user);
635 
636         switch (isl_schedule_node_get_type(Node.get())) {
637         case isl_schedule_node_band: {
638           NumBands[Version]++;
639           if (isl_schedule_node_band_get_permutable(Node.get()) ==
640               isl_bool_true)
641             NumPermutable[Version]++;
642 
643           int CountMembers = isl_schedule_node_band_n_member(Node.get());
644           NumBandMembers[Version] += CountMembers;
645           for (int i = 0; i < CountMembers; i += 1) {
646             if (Node.as<isl::schedule_node_band>().member_get_coincident(i))
647               NumCoincident[Version]++;
648           }
649           break;
650         }
651 
652         case isl_schedule_node_filter:
653           NumFilters[Version]++;
654           break;
655 
656         case isl_schedule_node_extension:
657           NumExtension[Version]++;
658           break;
659 
660         default:
661           break;
662         }
663 
664         return isl_bool_true;
665       },
666       &Version);
667 }
668 
669 static bool runIslScheduleOptimizer(
670     Scop &S,
671     function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps,
672     TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE,
673     isl::schedule &LastSchedule) {
674 
675   // Skip SCoPs in case they're already optimised by PPCGCodeGeneration
676   if (S.isToBeSkipped())
677     return false;
678 
679   // Skip empty SCoPs but still allow code generation as it will delete the
680   // loops present but not needed.
681   if (S.getSize() == 0) {
682     S.markAsOptimized();
683     return false;
684   }
685 
686   ScopsProcessed++;
687 
688   // Schedule without optimizations.
689   isl::schedule Schedule = S.getScheduleTree();
690   walkScheduleTreeForStatistics(S.getScheduleTree(), 0);
691   LLVM_DEBUG(printSchedule(dbgs(), Schedule, "Original schedule tree"));
692 
693   bool HasUserTransformation = false;
694   if (PragmaBasedOpts) {
695     isl::schedule ManuallyTransformed = applyManualTransformations(
696         &S, Schedule, GetDeps(Dependences::AL_Statement), ORE);
697     if (ManuallyTransformed.is_null()) {
698       LLVM_DEBUG(dbgs() << "Error during manual optimization\n");
699       return false;
700     }
701 
702     if (ManuallyTransformed.get() != Schedule.get()) {
703       // User transformations have precedence over other transformations.
704       HasUserTransformation = true;
705       Schedule = std::move(ManuallyTransformed);
706       LLVM_DEBUG(
707           printSchedule(dbgs(), Schedule, "After manual transformations"));
708     }
709   }
710 
711   // Only continue if either manual transformations have been applied or we are
712   // allowed to apply heuristics.
713   // TODO: Detect disabled heuristics and no user-directed transformation
714   // metadata earlier in ScopDetection.
715   if (!HasUserTransformation && S.hasDisableHeuristicsHint()) {
716     LLVM_DEBUG(dbgs() << "Heuristic optimizations disabled by metadata\n");
717     return false;
718   }
719 
720   // Get dependency analysis.
721   const Dependences &D = GetDeps(Dependences::AL_Statement);
722   if (D.getSharedIslCtx() != S.getSharedIslCtx()) {
723     LLVM_DEBUG(dbgs() << "DependenceInfo for another SCoP/isl_ctx\n");
724     return false;
725   }
726   if (!D.hasValidDependences()) {
727     LLVM_DEBUG(dbgs() << "Dependency information not available\n");
728     return false;
729   }
730 
731   // Apply ISL's algorithm only if not overriden by the user. Note that
732   // post-rescheduling optimizations (tiling, pattern-based, prevectorization)
733   // rely on the coincidence/permutable annotations on schedule tree bands that
734   // are added by the rescheduling analyzer. Therefore, disabling the
735   // rescheduler implicitly also disables these optimizations.
736   if (!EnableReschedule) {
737     LLVM_DEBUG(dbgs() << "Skipping rescheduling due to command line option\n");
738   } else if (HasUserTransformation) {
739     LLVM_DEBUG(
740         dbgs() << "Skipping rescheduling due to manual transformation\n");
741   } else {
742     // Build input data.
743     int ValidityKinds =
744         Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
745     int ProximityKinds;
746 
747     if (OptimizeDeps == "all")
748       ProximityKinds =
749           Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
750     else if (OptimizeDeps == "raw")
751       ProximityKinds = Dependences::TYPE_RAW;
752     else {
753       errs() << "Do not know how to optimize for '" << OptimizeDeps << "'"
754              << " Falling back to optimizing all dependences.\n";
755       ProximityKinds =
756           Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
757     }
758 
759     isl::union_set Domain = S.getDomains();
760 
761     if (Domain.is_null())
762       return false;
763 
764     isl::union_map Validity = D.getDependences(ValidityKinds);
765     isl::union_map Proximity = D.getDependences(ProximityKinds);
766 
767     // Simplify the dependences by removing the constraints introduced by the
768     // domains. This can speed up the scheduling time significantly, as large
769     // constant coefficients will be removed from the dependences. The
770     // introduction of some additional dependences reduces the possible
771     // transformations, but in most cases, such transformation do not seem to be
772     // interesting anyway. In some cases this option may stop the scheduler to
773     // find any schedule.
774     if (SimplifyDeps == "yes") {
775       Validity = Validity.gist_domain(Domain);
776       Validity = Validity.gist_range(Domain);
777       Proximity = Proximity.gist_domain(Domain);
778       Proximity = Proximity.gist_range(Domain);
779     } else if (SimplifyDeps != "no") {
780       errs()
781           << "warning: Option -polly-opt-simplify-deps should either be 'yes' "
782              "or 'no'. Falling back to default: 'yes'\n";
783     }
784 
785     LLVM_DEBUG(dbgs() << "\n\nCompute schedule from: ");
786     LLVM_DEBUG(dbgs() << "Domain := " << Domain << ";\n");
787     LLVM_DEBUG(dbgs() << "Proximity := " << Proximity << ";\n");
788     LLVM_DEBUG(dbgs() << "Validity := " << Validity << ";\n");
789 
790     unsigned IslSerializeSCCs;
791 
792     if (FusionStrategy == "max") {
793       IslSerializeSCCs = 0;
794     } else if (FusionStrategy == "min") {
795       IslSerializeSCCs = 1;
796     } else {
797       errs() << "warning: Unknown fusion strategy. Falling back to maximal "
798                 "fusion.\n";
799       IslSerializeSCCs = 0;
800     }
801 
802     int IslMaximizeBands;
803 
804     if (MaximizeBandDepth == "yes") {
805       IslMaximizeBands = 1;
806     } else if (MaximizeBandDepth == "no") {
807       IslMaximizeBands = 0;
808     } else {
809       errs()
810           << "warning: Option -polly-opt-maximize-bands should either be 'yes'"
811              " or 'no'. Falling back to default: 'yes'\n";
812       IslMaximizeBands = 1;
813     }
814 
815     int IslOuterCoincidence;
816 
817     if (OuterCoincidence == "yes") {
818       IslOuterCoincidence = 1;
819     } else if (OuterCoincidence == "no") {
820       IslOuterCoincidence = 0;
821     } else {
822       errs() << "warning: Option -polly-opt-outer-coincidence should either be "
823                 "'yes' or 'no'. Falling back to default: 'no'\n";
824       IslOuterCoincidence = 0;
825     }
826 
827     isl_ctx *Ctx = S.getIslCtx().get();
828 
829     isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence);
830     isl_options_set_schedule_serialize_sccs(Ctx, IslSerializeSCCs);
831     isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands);
832     isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm);
833     isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient);
834     isl_options_set_tile_scale_tile_loops(Ctx, 0);
835 
836     auto OnErrorStatus = isl_options_get_on_error(Ctx);
837     isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE);
838 
839     auto SC = isl::schedule_constraints::on_domain(Domain);
840     SC = SC.set_proximity(Proximity);
841     SC = SC.set_validity(Validity);
842     SC = SC.set_coincidence(Validity);
843     Schedule = SC.compute_schedule();
844     isl_options_set_on_error(Ctx, OnErrorStatus);
845 
846     ScopsRescheduled++;
847     LLVM_DEBUG(printSchedule(dbgs(), Schedule, "After rescheduling"));
848   }
849 
850   walkScheduleTreeForStatistics(Schedule, 1);
851 
852   // In cases the scheduler is not able to optimize the code, we just do not
853   // touch the schedule.
854   if (Schedule.is_null())
855     return false;
856 
857   // Apply post-rescheduling optimizations (if enabled) and/or prevectorization.
858   const OptimizerAdditionalInfoTy OAI = {
859       TTI, const_cast<Dependences *>(&D),
860       /*PatternOpts=*/!HasUserTransformation && PMBasedOpts,
861       /*Postopts=*/!HasUserTransformation && EnablePostopts,
862       /*Prevect=*/PollyVectorizerChoice != VECTORIZER_NONE};
863   if (OAI.PatternOpts || OAI.Postopts || OAI.Prevect) {
864     Schedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI);
865     Schedule = hoistExtensionNodes(Schedule);
866     LLVM_DEBUG(printSchedule(dbgs(), Schedule, "After post-optimizations"));
867     walkScheduleTreeForStatistics(Schedule, 2);
868   }
869 
870   // Skip profitability check if user transformation(s) have been applied.
871   if (!HasUserTransformation &&
872       !ScheduleTreeOptimizer::isProfitableSchedule(S, Schedule))
873     return false;
874 
875   auto ScopStats = S.getStatistics();
876   ScopsOptimized++;
877   NumAffineLoopsOptimized += ScopStats.NumAffineLoops;
878   NumBoxedLoopsOptimized += ScopStats.NumBoxedLoops;
879   LastSchedule = Schedule;
880 
881   S.setScheduleTree(Schedule);
882   S.markAsOptimized();
883 
884   if (OptimizedScops)
885     errs() << S;
886 
887   return false;
888 }
889 
890 bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) {
891   releaseMemory();
892 
893   Function &F = S.getFunction();
894   IslCtx = S.getSharedIslCtx();
895 
896   auto getDependences =
897       [this](Dependences::AnalysisLevel) -> const Dependences & {
898     return getAnalysis<DependenceInfo>().getDependences(
899         Dependences::AL_Statement);
900   };
901   OptimizationRemarkEmitter &ORE =
902       getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
903   TargetTransformInfo *TTI =
904       &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
905   return runIslScheduleOptimizer(S, getDependences, TTI, &ORE, LastSchedule);
906 }
907 
908 static void runScheduleOptimizerPrinter(raw_ostream &OS,
909                                         isl::schedule LastSchedule) {
910   isl_printer *p;
911   char *ScheduleStr;
912 
913   OS << "Calculated schedule:\n";
914 
915   if (LastSchedule.is_null()) {
916     OS << "n/a\n";
917     return;
918   }
919 
920   p = isl_printer_to_str(LastSchedule.ctx().get());
921   p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK);
922   p = isl_printer_print_schedule(p, LastSchedule.get());
923   ScheduleStr = isl_printer_get_str(p);
924   isl_printer_free(p);
925 
926   OS << ScheduleStr << "\n";
927 
928   free(ScheduleStr);
929 }
930 
931 void IslScheduleOptimizerWrapperPass::printScop(raw_ostream &OS, Scop &) const {
932   runScheduleOptimizerPrinter(OS, LastSchedule);
933 }
934 
935 void IslScheduleOptimizerWrapperPass::getAnalysisUsage(
936     AnalysisUsage &AU) const {
937   ScopPass::getAnalysisUsage(AU);
938   AU.addRequired<DependenceInfo>();
939   AU.addRequired<TargetTransformInfoWrapperPass>();
940   AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
941 
942   AU.addPreserved<DependenceInfo>();
943   AU.addPreserved<OptimizationRemarkEmitterWrapperPass>();
944 }
945 
946 } // namespace
947 
948 Pass *polly::createIslScheduleOptimizerWrapperPass() {
949   return new IslScheduleOptimizerWrapperPass();
950 }
951 
952 INITIALIZE_PASS_BEGIN(IslScheduleOptimizerWrapperPass, "polly-opt-isl",
953                       "Polly - Optimize schedule of SCoP", false, false);
954 INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
955 INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass);
956 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass);
957 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass);
958 INITIALIZE_PASS_END(IslScheduleOptimizerWrapperPass, "polly-opt-isl",
959                     "Polly - Optimize schedule of SCoP", false, false)
960 
961 static llvm::PreservedAnalyses
962 runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM,
963                                 ScopStandardAnalysisResults &SAR, SPMUpdater &U,
964                                 raw_ostream *OS) {
965   DependenceAnalysis::Result &Deps = SAM.getResult<DependenceAnalysis>(S, SAR);
966   auto GetDeps = [&Deps](Dependences::AnalysisLevel) -> const Dependences & {
967     return Deps.getDependences(Dependences::AL_Statement);
968   };
969   OptimizationRemarkEmitter ORE(&S.getFunction());
970   TargetTransformInfo *TTI = &SAR.TTI;
971   isl::schedule LastSchedule;
972   bool Modified = runIslScheduleOptimizer(S, GetDeps, TTI, &ORE, LastSchedule);
973   if (OS) {
974     *OS << "Printing analysis 'Polly - Optimize schedule of SCoP' for region: '"
975         << S.getName() << "' in function '" << S.getFunction().getName()
976         << "':\n";
977     runScheduleOptimizerPrinter(*OS, LastSchedule);
978   }
979 
980   if (!Modified)
981     return PreservedAnalyses::all();
982 
983   PreservedAnalyses PA;
984   PA.preserveSet<AllAnalysesOn<Module>>();
985   PA.preserveSet<AllAnalysesOn<Function>>();
986   PA.preserveSet<AllAnalysesOn<Loop>>();
987   return PA;
988 }
989 
990 llvm::PreservedAnalyses
991 IslScheduleOptimizerPass::run(Scop &S, ScopAnalysisManager &SAM,
992                               ScopStandardAnalysisResults &SAR, SPMUpdater &U) {
993   return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, nullptr);
994 }
995 
996 llvm::PreservedAnalyses
997 IslScheduleOptimizerPrinterPass::run(Scop &S, ScopAnalysisManager &SAM,
998                                      ScopStandardAnalysisResults &SAR,
999                                      SPMUpdater &U) {
1000   return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, &OS);
1001 }
1002