1 //===- ScheduleOptimizer.cpp - Calculate an optimized schedule ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass generates an entirely new schedule tree from the data dependences
10 // and iteration domains. The new schedule tree is computed in two steps:
11 //
12 // 1) The isl scheduling optimizer is run
13 //
14 // The isl scheduling optimizer creates a new schedule tree that maximizes
15 // parallelism and tileability and minimizes data-dependence distances. The
16 // algorithm used is a modified version of the ``Pluto'' algorithm:
17 //
18 //   U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
19 //   A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
20 //   In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
21 //   Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008.
22 //
23 // 2) A set of post-scheduling transformations is applied on the schedule tree.
24 //
25 // These optimizations include:
26 //
27 //  - Tiling of the innermost tilable bands
28 //  - Prevectorization - The choice of a possible outer loop that is strip-mined
29 //                       to the innermost level to enable inner-loop
30 //                       vectorization.
31 //  - Some optimizations for spatial locality are also planned.
32 //
33 // For a detailed description of the schedule tree itself please see section 6
34 // of:
35 //
36 // Polyhedral AST generation is more than scanning polyhedra
37 // Tobias Grosser, Sven Verdoolaege, Albert Cohen
38 // ACM Transactions on Programming Languages and Systems (TOPLAS),
39 // 37(4), July 2015
40 // http://www.grosser.es/#pub-polyhedral-AST-generation
41 //
42 // This publication also contains a detailed discussion of the different options
43 // for polyhedral loop unrolling, full/partial tile separation and other uses
44 // of the schedule tree.
45 //
46 //===----------------------------------------------------------------------===//
47 
48 #include "polly/ScheduleOptimizer.h"
49 #include "polly/CodeGen/CodeGeneration.h"
50 #include "polly/DependenceInfo.h"
51 #include "polly/ManualOptimizer.h"
52 #include "polly/MatmulOptimizer.h"
53 #include "polly/Options.h"
54 #include "polly/ScheduleTreeTransform.h"
55 #include "polly/Support/ISLOStream.h"
56 #include "polly/Support/ISLTools.h"
57 #include "llvm/ADT/Sequence.h"
58 #include "llvm/ADT/Statistic.h"
59 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
60 #include "llvm/InitializePasses.h"
61 #include "llvm/Support/CommandLine.h"
62 #include "isl/options.h"
63 
64 using namespace llvm;
65 using namespace polly;
66 
67 namespace llvm {
68 class Loop;
69 class Module;
70 } // namespace llvm
71 
72 #define DEBUG_TYPE "polly-opt-isl"
73 
74 static cl::opt<std::string>
75     OptimizeDeps("polly-opt-optimize-only",
76                  cl::desc("Only a certain kind of dependences (all/raw)"),
77                  cl::Hidden, cl::init("all"), cl::cat(PollyCategory));
78 
79 static cl::opt<std::string>
80     SimplifyDeps("polly-opt-simplify-deps",
81                  cl::desc("Dependences should be simplified (yes/no)"),
82                  cl::Hidden, cl::init("yes"), cl::cat(PollyCategory));
83 
84 static cl::opt<int> MaxConstantTerm(
85     "polly-opt-max-constant-term",
86     cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden,
87     cl::init(20), cl::cat(PollyCategory));
88 
89 static cl::opt<int> MaxCoefficient(
90     "polly-opt-max-coefficient",
91     cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden,
92     cl::init(20), cl::cat(PollyCategory));
93 
94 static cl::opt<std::string>
95     MaximizeBandDepth("polly-opt-maximize-bands",
96                       cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
97                       cl::init("yes"), cl::cat(PollyCategory));
98 
99 static cl::opt<bool>
100     GreedyFusion("polly-loopfusion-greedy",
101                  cl::desc("Aggressively try to fuse everything"), cl::Hidden,
102                  cl::cat(PollyCategory));
103 
104 static cl::opt<std::string> OuterCoincidence(
105     "polly-opt-outer-coincidence",
106     cl::desc("Try to construct schedules where the outer member of each band "
107              "satisfies the coincidence constraints (yes/no)"),
108     cl::Hidden, cl::init("no"), cl::cat(PollyCategory));
109 
110 static cl::opt<int> PrevectorWidth(
111     "polly-prevect-width",
112     cl::desc(
113         "The number of loop iterations to strip-mine for pre-vectorization"),
114     cl::Hidden, cl::init(4), cl::cat(PollyCategory));
115 
116 static cl::opt<bool> FirstLevelTiling("polly-tiling",
117                                       cl::desc("Enable loop tiling"),
118                                       cl::init(true), cl::cat(PollyCategory));
119 
120 static cl::opt<int> FirstLevelDefaultTileSize(
121     "polly-default-tile-size",
122     cl::desc("The default tile size (if not enough were provided by"
123              " --polly-tile-sizes)"),
124     cl::Hidden, cl::init(32), cl::cat(PollyCategory));
125 
126 static cl::list<int>
127     FirstLevelTileSizes("polly-tile-sizes",
128                         cl::desc("A tile size for each loop dimension, filled "
129                                  "with --polly-default-tile-size"),
130                         cl::Hidden, cl::CommaSeparated, cl::cat(PollyCategory));
131 
132 static cl::opt<bool>
133     SecondLevelTiling("polly-2nd-level-tiling",
134                       cl::desc("Enable a 2nd level loop of loop tiling"),
135                       cl::cat(PollyCategory));
136 
137 static cl::opt<int> SecondLevelDefaultTileSize(
138     "polly-2nd-level-default-tile-size",
139     cl::desc("The default 2nd-level tile size (if not enough were provided by"
140              " --polly-2nd-level-tile-sizes)"),
141     cl::Hidden, cl::init(16), cl::cat(PollyCategory));
142 
143 static cl::list<int>
144     SecondLevelTileSizes("polly-2nd-level-tile-sizes",
145                          cl::desc("A tile size for each loop dimension, filled "
146                                   "with --polly-default-tile-size"),
147                          cl::Hidden, cl::CommaSeparated,
148                          cl::cat(PollyCategory));
149 
150 static cl::opt<bool> RegisterTiling("polly-register-tiling",
151                                     cl::desc("Enable register tiling"),
152                                     cl::cat(PollyCategory));
153 
154 static cl::opt<int> RegisterDefaultTileSize(
155     "polly-register-tiling-default-tile-size",
156     cl::desc("The default register tile size (if not enough were provided by"
157              " --polly-register-tile-sizes)"),
158     cl::Hidden, cl::init(2), cl::cat(PollyCategory));
159 
160 static cl::list<int>
161     RegisterTileSizes("polly-register-tile-sizes",
162                       cl::desc("A tile size for each loop dimension, filled "
163                                "with --polly-register-tile-size"),
164                       cl::Hidden, cl::CommaSeparated, cl::cat(PollyCategory));
165 
166 static cl::opt<bool> PragmaBasedOpts(
167     "polly-pragma-based-opts",
168     cl::desc("Apply user-directed transformation from metadata"),
169     cl::init(true), cl::cat(PollyCategory));
170 
171 static cl::opt<bool> EnableReschedule("polly-reschedule",
172                                       cl::desc("Optimize SCoPs using ISL"),
173                                       cl::init(true), cl::cat(PollyCategory));
174 
175 static cl::opt<bool>
176     PMBasedOpts("polly-pattern-matching-based-opts",
177                 cl::desc("Perform optimizations based on pattern matching"),
178                 cl::init(true), cl::cat(PollyCategory));
179 
180 static cl::opt<bool>
181     EnablePostopts("polly-postopts",
182                    cl::desc("Apply post-rescheduling optimizations such as "
183                             "tiling (requires -polly-reschedule)"),
184                    cl::init(true), cl::cat(PollyCategory));
185 
186 static cl::opt<bool> OptimizedScops(
187     "polly-optimized-scops",
188     cl::desc("Polly - Dump polyhedral description of Scops optimized with "
189              "the isl scheduling optimizer and the set of post-scheduling "
190              "transformations is applied on the schedule tree"),
191     cl::cat(PollyCategory));
192 
193 STATISTIC(ScopsProcessed, "Number of scops processed");
194 STATISTIC(ScopsRescheduled, "Number of scops rescheduled");
195 STATISTIC(ScopsOptimized, "Number of scops optimized");
196 
197 STATISTIC(NumAffineLoopsOptimized, "Number of affine loops optimized");
198 STATISTIC(NumBoxedLoopsOptimized, "Number of boxed loops optimized");
199 
200 #define THREE_STATISTICS(VARNAME, DESC)                                        \
201   static Statistic VARNAME[3] = {                                              \
202       {DEBUG_TYPE, #VARNAME "0", DESC " (original)"},                          \
203       {DEBUG_TYPE, #VARNAME "1", DESC " (after scheduler)"},                   \
204       {DEBUG_TYPE, #VARNAME "2", DESC " (after optimizer)"}}
205 
206 THREE_STATISTICS(NumBands, "Number of bands");
207 THREE_STATISTICS(NumBandMembers, "Number of band members");
208 THREE_STATISTICS(NumCoincident, "Number of coincident band members");
209 THREE_STATISTICS(NumPermutable, "Number of permutable bands");
210 THREE_STATISTICS(NumFilters, "Number of filter nodes");
211 THREE_STATISTICS(NumExtension, "Number of extension nodes");
212 
213 STATISTIC(FirstLevelTileOpts, "Number of first level tiling applied");
214 STATISTIC(SecondLevelTileOpts, "Number of second level tiling applied");
215 STATISTIC(RegisterTileOpts, "Number of register tiling applied");
216 STATISTIC(PrevectOpts, "Number of strip-mining for prevectorization applied");
217 STATISTIC(MatMulOpts,
218           "Number of matrix multiplication patterns detected and optimized");
219 
220 namespace {
221 /// Additional parameters of the schedule optimizer.
222 ///
223 /// Target Transform Info and the SCoP dependencies used by the schedule
224 /// optimizer.
225 struct OptimizerAdditionalInfoTy {
226   const llvm::TargetTransformInfo *TTI;
227   const Dependences *D;
228   bool PatternOpts;
229   bool Postopts;
230   bool Prevect;
231 };
232 
233 class ScheduleTreeOptimizer final {
234 public:
235   /// Apply schedule tree transformations.
236   ///
237   /// This function takes an (possibly already optimized) schedule tree and
238   /// applies a set of additional optimizations on the schedule tree. The
239   /// transformations applied include:
240   ///
241   ///   - Pattern-based optimizations
242   ///   - Tiling
243   ///   - Prevectorization
244   ///
245   /// @param Schedule The schedule object the transformations will be applied
246   ///                 to.
247   /// @param OAI      Target Transform Info and the SCoP dependencies.
248   /// @returns        The transformed schedule.
249   static isl::schedule
250   optimizeSchedule(isl::schedule Schedule,
251                    const OptimizerAdditionalInfoTy *OAI = nullptr);
252 
253   /// Apply schedule tree transformations.
254   ///
255   /// This function takes a node in an (possibly already optimized) schedule
256   /// tree and applies a set of additional optimizations on this schedule tree
257   /// node and its descendants. The transformations applied include:
258   ///
259   ///   - Pattern-based optimizations
260   ///   - Tiling
261   ///   - Prevectorization
262   ///
263   /// @param Node The schedule object post-transformations will be applied to.
264   /// @param OAI  Target Transform Info and the SCoP dependencies.
265   /// @returns    The transformed schedule.
266   static isl::schedule_node
267   optimizeScheduleNode(isl::schedule_node Node,
268                        const OptimizerAdditionalInfoTy *OAI = nullptr);
269 
270   /// Decide if the @p NewSchedule is profitable for @p S.
271   ///
272   /// @param S           The SCoP we optimize.
273   /// @param NewSchedule The new schedule we computed.
274   ///
275   /// @return True, if we believe @p NewSchedule is an improvement for @p S.
276   static bool isProfitableSchedule(polly::Scop &S, isl::schedule NewSchedule);
277 
278   /// Isolate a set of partial tile prefixes.
279   ///
280   /// This set should ensure that it contains only partial tile prefixes that
281   /// have exactly VectorWidth iterations.
282   ///
283   /// @param Node A schedule node band, which is a parent of a band node,
284   ///             that contains a vector loop.
285   /// @return Modified isl_schedule_node.
286   static isl::schedule_node isolateFullPartialTiles(isl::schedule_node Node,
287                                                     int VectorWidth);
288 
289 private:
290   /// Check if this node is a band node we want to tile.
291   ///
292   /// We look for innermost band nodes where individual dimensions are marked as
293   /// permutable.
294   ///
295   /// @param Node The node to check.
296   static bool isTileableBandNode(isl::schedule_node Node);
297 
298   /// Pre-vectorizes one scheduling dimension of a schedule band.
299   ///
300   /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and
301   /// sinks the resulting point loop.
302   ///
303   /// Example (DimToVectorize=0, VectorWidth=4):
304   ///
305   /// | Before transformation:
306   /// |
307   /// | A[i,j] -> [i,j]
308   /// |
309   /// | for (i = 0; i < 128; i++)
310   /// |    for (j = 0; j < 128; j++)
311   /// |      A(i,j);
312   ///
313   /// | After transformation:
314   /// |
315   /// | for (it = 0; it < 32; it+=1)
316   /// |    for (j = 0; j < 128; j++)
317   /// |      for (ip = 0; ip <= 3; ip++)
318   /// |        A(4 * it + ip,j);
319   ///
320   /// The goal of this transformation is to create a trivially vectorizable
321   /// loop.  This means a parallel loop at the innermost level that has a
322   /// constant number of iterations corresponding to the target vector width.
323   ///
324   /// This transformation creates a loop at the innermost level. The loop has
325   /// a constant number of iterations, if the number of loop iterations at
326   /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is
327   /// currently constant and not yet target specific. This function does not
328   /// reason about parallelism.
329   static isl::schedule_node prevectSchedBand(isl::schedule_node Node,
330                                              unsigned DimToVectorize,
331                                              int VectorWidth);
332 
333   /// Apply additional optimizations on the bands in the schedule tree.
334   ///
335   /// We are looking for an innermost band node and apply the following
336   /// transformations:
337   ///
338   ///  - Tile the band
339   ///      - if the band is tileable
340   ///      - if the band has more than one loop dimension
341   ///
342   ///  - Prevectorize the schedule of the band (or the point loop in case of
343   ///    tiling).
344   ///      - if vectorization is enabled
345   ///
346   /// @param Node The schedule node to (possibly) optimize.
347   /// @param User A pointer to forward some use information
348   ///        (currently unused).
349   static isl_schedule_node *optimizeBand(isl_schedule_node *Node, void *User);
350 
351   /// Apply tiling optimizations on the bands in the schedule tree.
352   ///
353   /// @param Node The schedule node to (possibly) optimize.
354   static isl::schedule_node applyTileBandOpt(isl::schedule_node Node);
355 
356   /// Apply prevectorization on the bands in the schedule tree.
357   ///
358   /// @param Node The schedule node to (possibly) prevectorize.
359   static isl::schedule_node applyPrevectBandOpt(isl::schedule_node Node);
360 };
361 
362 isl::schedule_node
363 ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node,
364                                                int VectorWidth) {
365   assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
366   Node = Node.child(0).child(0);
367   isl::union_map SchedRelUMap = Node.get_prefix_schedule_relation();
368   isl::union_set ScheduleRangeUSet = SchedRelUMap.range();
369   isl::set ScheduleRange{ScheduleRangeUSet};
370   isl::set IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth);
371   auto AtomicOption = getDimOptions(IsolateDomain.ctx(), "atomic");
372   isl::union_set IsolateOption = getIsolateOptions(IsolateDomain, 1);
373   Node = Node.parent().parent();
374   isl::union_set Options = IsolateOption.unite(AtomicOption);
375   isl::schedule_node_band Result =
376       Node.as<isl::schedule_node_band>().set_ast_build_options(Options);
377   return Result;
378 }
379 
380 struct InsertSimdMarkers final : ScheduleNodeRewriter<InsertSimdMarkers> {
381   isl::schedule_node visitBand(isl::schedule_node_band Band) {
382     isl::schedule_node Node = visitChildren(Band);
383 
384     // Only add SIMD markers to innermost bands.
385     if (!Node.first_child().isa<isl::schedule_node_leaf>())
386       return Node;
387 
388     isl::id LoopMarker = isl::id::alloc(Band.ctx(), "SIMD", nullptr);
389     return Band.insert_mark(LoopMarker);
390   }
391 };
392 
393 isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand(
394     isl::schedule_node Node, unsigned DimToVectorize, int VectorWidth) {
395   assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
396 
397   auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
398   unsigned ScheduleDimensions = unsignedFromIslSize(Space.dim(isl::dim::set));
399   assert(DimToVectorize < ScheduleDimensions);
400 
401   if (DimToVectorize > 0) {
402     Node = isl::manage(
403         isl_schedule_node_band_split(Node.release(), DimToVectorize));
404     Node = Node.child(0);
405   }
406   if (DimToVectorize < ScheduleDimensions - 1)
407     Node = isl::manage(isl_schedule_node_band_split(Node.release(), 1));
408   Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
409   auto Sizes = isl::multi_val::zero(Space);
410   Sizes = Sizes.set_val(0, isl::val(Node.ctx(), VectorWidth));
411   Node =
412       isl::manage(isl_schedule_node_band_tile(Node.release(), Sizes.release()));
413   Node = isolateFullPartialTiles(Node, VectorWidth);
414   Node = Node.child(0);
415   // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise,
416   // we will have troubles to match it in the backend.
417   Node = Node.as<isl::schedule_node_band>().set_ast_build_options(
418       isl::union_set(Node.ctx(), "{ unroll[x]: 1 = 0 }"));
419 
420   // Sink the inner loop into the smallest possible statements to make them
421   // represent a single vector instruction if possible.
422   Node = isl::manage(isl_schedule_node_band_sink(Node.release()));
423 
424   // Add SIMD markers to those vector statements.
425   InsertSimdMarkers SimdMarkerInserter;
426   Node = SimdMarkerInserter.visit(Node);
427 
428   PrevectOpts++;
429   return Node.parent();
430 }
431 
432 static bool isSimpleInnermostBand(const isl::schedule_node &Node) {
433   assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
434   assert(isl_schedule_node_n_children(Node.get()) == 1);
435 
436   auto ChildType = isl_schedule_node_get_type(Node.child(0).get());
437 
438   if (ChildType == isl_schedule_node_leaf)
439     return true;
440 
441   if (ChildType != isl_schedule_node_sequence)
442     return false;
443 
444   auto Sequence = Node.child(0);
445 
446   for (int c = 0, nc = isl_schedule_node_n_children(Sequence.get()); c < nc;
447        ++c) {
448     auto Child = Sequence.child(c);
449     if (isl_schedule_node_get_type(Child.get()) != isl_schedule_node_filter)
450       return false;
451     if (isl_schedule_node_get_type(Child.child(0).get()) !=
452         isl_schedule_node_leaf)
453       return false;
454   }
455   return true;
456 }
457 
458 bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) {
459   if (isl_schedule_node_get_type(Node.get()) != isl_schedule_node_band)
460     return false;
461 
462   if (isl_schedule_node_n_children(Node.get()) != 1)
463     return false;
464 
465   if (!isl_schedule_node_band_get_permutable(Node.get()))
466     return false;
467 
468   auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
469 
470   if (unsignedFromIslSize(Space.dim(isl::dim::set)) <= 1u)
471     return false;
472 
473   return isSimpleInnermostBand(Node);
474 }
475 
476 __isl_give isl::schedule_node
477 ScheduleTreeOptimizer::applyTileBandOpt(isl::schedule_node Node) {
478   if (FirstLevelTiling) {
479     Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes,
480                     FirstLevelDefaultTileSize);
481     FirstLevelTileOpts++;
482   }
483 
484   if (SecondLevelTiling) {
485     Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes,
486                     SecondLevelDefaultTileSize);
487     SecondLevelTileOpts++;
488   }
489 
490   if (RegisterTiling) {
491     Node =
492         applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize);
493     RegisterTileOpts++;
494   }
495 
496   return Node;
497 }
498 
499 isl::schedule_node
500 ScheduleTreeOptimizer::applyPrevectBandOpt(isl::schedule_node Node) {
501   auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
502   int Dims = unsignedFromIslSize(Space.dim(isl::dim::set));
503 
504   for (int i = Dims - 1; i >= 0; i--)
505     if (Node.as<isl::schedule_node_band>().member_get_coincident(i)) {
506       Node = prevectSchedBand(Node, i, PrevectorWidth);
507       break;
508     }
509 
510   return Node;
511 }
512 
513 __isl_give isl_schedule_node *
514 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg,
515                                     void *User) {
516   const OptimizerAdditionalInfoTy *OAI =
517       static_cast<const OptimizerAdditionalInfoTy *>(User);
518   assert(OAI && "Expecting optimization options");
519 
520   isl::schedule_node Node = isl::manage(NodeArg);
521   if (!isTileableBandNode(Node))
522     return Node.release();
523 
524   if (OAI->PatternOpts) {
525     isl::schedule_node PatternOptimizedSchedule =
526         tryOptimizeMatMulPattern(Node, OAI->TTI, OAI->D);
527     if (!PatternOptimizedSchedule.is_null()) {
528       MatMulOpts++;
529       return PatternOptimizedSchedule.release();
530     }
531   }
532 
533   if (OAI->Postopts)
534     Node = applyTileBandOpt(Node);
535 
536   if (OAI->Prevect) {
537     // FIXME: Prevectorization requirements are different from those checked by
538     // isTileableBandNode.
539     Node = applyPrevectBandOpt(Node);
540   }
541 
542   return Node.release();
543 }
544 
545 isl::schedule
546 ScheduleTreeOptimizer::optimizeSchedule(isl::schedule Schedule,
547                                         const OptimizerAdditionalInfoTy *OAI) {
548   auto Root = Schedule.get_root();
549   Root = optimizeScheduleNode(Root, OAI);
550   return Root.get_schedule();
551 }
552 
553 isl::schedule_node ScheduleTreeOptimizer::optimizeScheduleNode(
554     isl::schedule_node Node, const OptimizerAdditionalInfoTy *OAI) {
555   Node = isl::manage(isl_schedule_node_map_descendant_bottom_up(
556       Node.release(), optimizeBand,
557       const_cast<void *>(static_cast<const void *>(OAI))));
558   return Node;
559 }
560 
561 bool ScheduleTreeOptimizer::isProfitableSchedule(Scop &S,
562                                                  isl::schedule NewSchedule) {
563   // To understand if the schedule has been optimized we check if the schedule
564   // has changed at all.
565   // TODO: We can improve this by tracking if any necessarily beneficial
566   // transformations have been performed. This can e.g. be tiling, loop
567   // interchange, or ...) We can track this either at the place where the
568   // transformation has been performed or, in case of automatic ILP based
569   // optimizations, by comparing (yet to be defined) performance metrics
570   // before/after the scheduling optimizer
571   // (e.g., #stride-one accesses)
572   // FIXME: A schedule tree whose union_map-conversion is identical to the
573   // original schedule map may still allow for parallelization, i.e. can still
574   // be profitable.
575   auto NewScheduleMap = NewSchedule.get_map();
576   auto OldSchedule = S.getSchedule();
577   assert(!OldSchedule.is_null() &&
578          "Only IslScheduleOptimizer can insert extension nodes "
579          "that make Scop::getSchedule() return nullptr.");
580   bool changed = !OldSchedule.is_equal(NewScheduleMap);
581   return changed;
582 }
583 
584 class IslScheduleOptimizerWrapperPass final : public ScopPass {
585 public:
586   static char ID;
587 
588   explicit IslScheduleOptimizerWrapperPass() : ScopPass(ID) {}
589 
590   /// Optimize the schedule of the SCoP @p S.
591   bool runOnScop(Scop &S) override;
592 
593   /// Print the new schedule for the SCoP @p S.
594   void printScop(raw_ostream &OS, Scop &S) const override;
595 
596   /// Register all analyses and transformation required.
597   void getAnalysisUsage(AnalysisUsage &AU) const override;
598 
599   /// Release the internal memory.
600   void releaseMemory() override {
601     LastSchedule = {};
602     IslCtx.reset();
603   }
604 
605 private:
606   std::shared_ptr<isl_ctx> IslCtx;
607   isl::schedule LastSchedule;
608 };
609 
610 char IslScheduleOptimizerWrapperPass::ID = 0;
611 
612 #ifndef NDEBUG
613 static void printSchedule(llvm::raw_ostream &OS, const isl::schedule &Schedule,
614                           StringRef Desc) {
615   isl::ctx Ctx = Schedule.ctx();
616   isl_printer *P = isl_printer_to_str(Ctx.get());
617   P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
618   P = isl_printer_print_schedule(P, Schedule.get());
619   char *Str = isl_printer_get_str(P);
620   OS << Desc << ": \n" << Str << "\n";
621   free(Str);
622   isl_printer_free(P);
623 }
624 #endif
625 
626 /// Collect statistics for the schedule tree.
627 ///
628 /// @param Schedule The schedule tree to analyze. If not a schedule tree it is
629 /// ignored.
630 /// @param Version  The version of the schedule tree that is analyzed.
631 ///                 0 for the original schedule tree before any transformation.
632 ///                 1 for the schedule tree after isl's rescheduling.
633 ///                 2 for the schedule tree after optimizations are applied
634 ///                 (tiling, pattern matching)
635 static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) {
636   auto Root = Schedule.get_root();
637   if (Root.is_null())
638     return;
639 
640   isl_schedule_node_foreach_descendant_top_down(
641       Root.get(),
642       [](__isl_keep isl_schedule_node *nodeptr, void *user) -> isl_bool {
643         isl::schedule_node Node = isl::manage_copy(nodeptr);
644         int Version = *static_cast<int *>(user);
645 
646         switch (isl_schedule_node_get_type(Node.get())) {
647         case isl_schedule_node_band: {
648           NumBands[Version]++;
649           if (isl_schedule_node_band_get_permutable(Node.get()) ==
650               isl_bool_true)
651             NumPermutable[Version]++;
652 
653           int CountMembers = isl_schedule_node_band_n_member(Node.get());
654           NumBandMembers[Version] += CountMembers;
655           for (int i = 0; i < CountMembers; i += 1) {
656             if (Node.as<isl::schedule_node_band>().member_get_coincident(i))
657               NumCoincident[Version]++;
658           }
659           break;
660         }
661 
662         case isl_schedule_node_filter:
663           NumFilters[Version]++;
664           break;
665 
666         case isl_schedule_node_extension:
667           NumExtension[Version]++;
668           break;
669 
670         default:
671           break;
672         }
673 
674         return isl_bool_true;
675       },
676       &Version);
677 }
678 
679 static bool runIslScheduleOptimizer(
680     Scop &S,
681     function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps,
682     TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE,
683     isl::schedule &LastSchedule) {
684 
685   // Skip SCoPs in case they're already optimised by PPCGCodeGeneration
686   if (S.isToBeSkipped())
687     return false;
688 
689   // Skip empty SCoPs but still allow code generation as it will delete the
690   // loops present but not needed.
691   if (S.getSize() == 0) {
692     S.markAsOptimized();
693     return false;
694   }
695 
696   ScopsProcessed++;
697 
698   // Schedule without optimizations.
699   isl::schedule Schedule = S.getScheduleTree();
700   walkScheduleTreeForStatistics(S.getScheduleTree(), 0);
701   LLVM_DEBUG(printSchedule(dbgs(), Schedule, "Original schedule tree"));
702 
703   bool HasUserTransformation = false;
704   if (PragmaBasedOpts) {
705     isl::schedule ManuallyTransformed = applyManualTransformations(
706         &S, Schedule, GetDeps(Dependences::AL_Statement), ORE);
707     if (ManuallyTransformed.is_null()) {
708       LLVM_DEBUG(dbgs() << "Error during manual optimization\n");
709       return false;
710     }
711 
712     if (ManuallyTransformed.get() != Schedule.get()) {
713       // User transformations have precedence over other transformations.
714       HasUserTransformation = true;
715       Schedule = std::move(ManuallyTransformed);
716       LLVM_DEBUG(
717           printSchedule(dbgs(), Schedule, "After manual transformations"));
718     }
719   }
720 
721   // Only continue if either manual transformations have been applied or we are
722   // allowed to apply heuristics.
723   // TODO: Detect disabled heuristics and no user-directed transformation
724   // metadata earlier in ScopDetection.
725   if (!HasUserTransformation && S.hasDisableHeuristicsHint()) {
726     LLVM_DEBUG(dbgs() << "Heuristic optimizations disabled by metadata\n");
727     return false;
728   }
729 
730   // Get dependency analysis.
731   const Dependences &D = GetDeps(Dependences::AL_Statement);
732   if (D.getSharedIslCtx() != S.getSharedIslCtx()) {
733     LLVM_DEBUG(dbgs() << "DependenceInfo for another SCoP/isl_ctx\n");
734     return false;
735   }
736   if (!D.hasValidDependences()) {
737     LLVM_DEBUG(dbgs() << "Dependency information not available\n");
738     return false;
739   }
740 
741   // Apply ISL's algorithm only if not overriden by the user. Note that
742   // post-rescheduling optimizations (tiling, pattern-based, prevectorization)
743   // rely on the coincidence/permutable annotations on schedule tree bands that
744   // are added by the rescheduling analyzer. Therefore, disabling the
745   // rescheduler implicitly also disables these optimizations.
746   if (!EnableReschedule) {
747     LLVM_DEBUG(dbgs() << "Skipping rescheduling due to command line option\n");
748   } else if (HasUserTransformation) {
749     LLVM_DEBUG(
750         dbgs() << "Skipping rescheduling due to manual transformation\n");
751   } else {
752     // Build input data.
753     int ValidityKinds =
754         Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
755     int ProximityKinds;
756 
757     if (OptimizeDeps == "all")
758       ProximityKinds =
759           Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
760     else if (OptimizeDeps == "raw")
761       ProximityKinds = Dependences::TYPE_RAW;
762     else {
763       errs() << "Do not know how to optimize for '" << OptimizeDeps << "'"
764              << " Falling back to optimizing all dependences.\n";
765       ProximityKinds =
766           Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
767     }
768 
769     isl::union_set Domain = S.getDomains();
770 
771     if (Domain.is_null())
772       return false;
773 
774     isl::union_map Validity = D.getDependences(ValidityKinds);
775     isl::union_map Proximity = D.getDependences(ProximityKinds);
776 
777     // Simplify the dependences by removing the constraints introduced by the
778     // domains. This can speed up the scheduling time significantly, as large
779     // constant coefficients will be removed from the dependences. The
780     // introduction of some additional dependences reduces the possible
781     // transformations, but in most cases, such transformation do not seem to be
782     // interesting anyway. In some cases this option may stop the scheduler to
783     // find any schedule.
784     if (SimplifyDeps == "yes") {
785       Validity = Validity.gist_domain(Domain);
786       Validity = Validity.gist_range(Domain);
787       Proximity = Proximity.gist_domain(Domain);
788       Proximity = Proximity.gist_range(Domain);
789     } else if (SimplifyDeps != "no") {
790       errs()
791           << "warning: Option -polly-opt-simplify-deps should either be 'yes' "
792              "or 'no'. Falling back to default: 'yes'\n";
793     }
794 
795     LLVM_DEBUG(dbgs() << "\n\nCompute schedule from: ");
796     LLVM_DEBUG(dbgs() << "Domain := " << Domain << ";\n");
797     LLVM_DEBUG(dbgs() << "Proximity := " << Proximity << ";\n");
798     LLVM_DEBUG(dbgs() << "Validity := " << Validity << ";\n");
799 
800     int IslMaximizeBands;
801     if (MaximizeBandDepth == "yes") {
802       IslMaximizeBands = 1;
803     } else if (MaximizeBandDepth == "no") {
804       IslMaximizeBands = 0;
805     } else {
806       errs()
807           << "warning: Option -polly-opt-maximize-bands should either be 'yes'"
808              " or 'no'. Falling back to default: 'yes'\n";
809       IslMaximizeBands = 1;
810     }
811 
812     int IslOuterCoincidence;
813     if (OuterCoincidence == "yes") {
814       IslOuterCoincidence = 1;
815     } else if (OuterCoincidence == "no") {
816       IslOuterCoincidence = 0;
817     } else {
818       errs() << "warning: Option -polly-opt-outer-coincidence should either be "
819                 "'yes' or 'no'. Falling back to default: 'no'\n";
820       IslOuterCoincidence = 0;
821     }
822 
823     isl_ctx *Ctx = S.getIslCtx().get();
824 
825     isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence);
826     isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands);
827     isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm);
828     isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient);
829     isl_options_set_tile_scale_tile_loops(Ctx, 0);
830 
831     auto OnErrorStatus = isl_options_get_on_error(Ctx);
832     isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE);
833 
834     auto SC = isl::schedule_constraints::on_domain(Domain);
835     SC = SC.set_proximity(Proximity);
836     SC = SC.set_validity(Validity);
837     SC = SC.set_coincidence(Validity);
838     Schedule = SC.compute_schedule();
839     isl_options_set_on_error(Ctx, OnErrorStatus);
840 
841     ScopsRescheduled++;
842     LLVM_DEBUG(printSchedule(dbgs(), Schedule, "After rescheduling"));
843   }
844 
845   walkScheduleTreeForStatistics(Schedule, 1);
846 
847   // In cases the scheduler is not able to optimize the code, we just do not
848   // touch the schedule.
849   if (Schedule.is_null())
850     return false;
851 
852   if (GreedyFusion) {
853     isl::union_map Validity = D.getDependences(
854         Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW);
855     Schedule = applyGreedyFusion(Schedule, Validity);
856     assert(!Schedule.is_null());
857   }
858 
859   // Apply post-rescheduling optimizations (if enabled) and/or prevectorization.
860   const OptimizerAdditionalInfoTy OAI = {
861       TTI, const_cast<Dependences *>(&D),
862       /*PatternOpts=*/!HasUserTransformation && PMBasedOpts,
863       /*Postopts=*/!HasUserTransformation && EnablePostopts,
864       /*Prevect=*/PollyVectorizerChoice != VECTORIZER_NONE};
865   if (OAI.PatternOpts || OAI.Postopts || OAI.Prevect) {
866     Schedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI);
867     Schedule = hoistExtensionNodes(Schedule);
868     LLVM_DEBUG(printSchedule(dbgs(), Schedule, "After post-optimizations"));
869     walkScheduleTreeForStatistics(Schedule, 2);
870   }
871 
872   // Skip profitability check if user transformation(s) have been applied.
873   if (!HasUserTransformation &&
874       !ScheduleTreeOptimizer::isProfitableSchedule(S, Schedule))
875     return false;
876 
877   auto ScopStats = S.getStatistics();
878   ScopsOptimized++;
879   NumAffineLoopsOptimized += ScopStats.NumAffineLoops;
880   NumBoxedLoopsOptimized += ScopStats.NumBoxedLoops;
881   LastSchedule = Schedule;
882 
883   S.setScheduleTree(Schedule);
884   S.markAsOptimized();
885 
886   if (OptimizedScops)
887     errs() << S;
888 
889   return false;
890 }
891 
892 bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) {
893   releaseMemory();
894 
895   Function &F = S.getFunction();
896   IslCtx = S.getSharedIslCtx();
897 
898   auto getDependences =
899       [this](Dependences::AnalysisLevel) -> const Dependences & {
900     return getAnalysis<DependenceInfo>().getDependences(
901         Dependences::AL_Statement);
902   };
903   OptimizationRemarkEmitter &ORE =
904       getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
905   TargetTransformInfo *TTI =
906       &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
907   return runIslScheduleOptimizer(S, getDependences, TTI, &ORE, LastSchedule);
908 }
909 
910 static void runScheduleOptimizerPrinter(raw_ostream &OS,
911                                         isl::schedule LastSchedule) {
912   isl_printer *p;
913   char *ScheduleStr;
914 
915   OS << "Calculated schedule:\n";
916 
917   if (LastSchedule.is_null()) {
918     OS << "n/a\n";
919     return;
920   }
921 
922   p = isl_printer_to_str(LastSchedule.ctx().get());
923   p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK);
924   p = isl_printer_print_schedule(p, LastSchedule.get());
925   ScheduleStr = isl_printer_get_str(p);
926   isl_printer_free(p);
927 
928   OS << ScheduleStr << "\n";
929 
930   free(ScheduleStr);
931 }
932 
933 void IslScheduleOptimizerWrapperPass::printScop(raw_ostream &OS, Scop &) const {
934   runScheduleOptimizerPrinter(OS, LastSchedule);
935 }
936 
937 void IslScheduleOptimizerWrapperPass::getAnalysisUsage(
938     AnalysisUsage &AU) const {
939   ScopPass::getAnalysisUsage(AU);
940   AU.addRequired<DependenceInfo>();
941   AU.addRequired<TargetTransformInfoWrapperPass>();
942   AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
943 
944   AU.addPreserved<DependenceInfo>();
945   AU.addPreserved<OptimizationRemarkEmitterWrapperPass>();
946 }
947 
948 } // namespace
949 
950 Pass *polly::createIslScheduleOptimizerWrapperPass() {
951   return new IslScheduleOptimizerWrapperPass();
952 }
953 
954 INITIALIZE_PASS_BEGIN(IslScheduleOptimizerWrapperPass, "polly-opt-isl",
955                       "Polly - Optimize schedule of SCoP", false, false);
956 INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
957 INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass);
958 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass);
959 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass);
960 INITIALIZE_PASS_END(IslScheduleOptimizerWrapperPass, "polly-opt-isl",
961                     "Polly - Optimize schedule of SCoP", false, false)
962 
963 static llvm::PreservedAnalyses
964 runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM,
965                                 ScopStandardAnalysisResults &SAR, SPMUpdater &U,
966                                 raw_ostream *OS) {
967   DependenceAnalysis::Result &Deps = SAM.getResult<DependenceAnalysis>(S, SAR);
968   auto GetDeps = [&Deps](Dependences::AnalysisLevel) -> const Dependences & {
969     return Deps.getDependences(Dependences::AL_Statement);
970   };
971   OptimizationRemarkEmitter ORE(&S.getFunction());
972   TargetTransformInfo *TTI = &SAR.TTI;
973   isl::schedule LastSchedule;
974   bool Modified = runIslScheduleOptimizer(S, GetDeps, TTI, &ORE, LastSchedule);
975   if (OS) {
976     *OS << "Printing analysis 'Polly - Optimize schedule of SCoP' for region: '"
977         << S.getName() << "' in function '" << S.getFunction().getName()
978         << "':\n";
979     runScheduleOptimizerPrinter(*OS, LastSchedule);
980   }
981 
982   if (!Modified)
983     return PreservedAnalyses::all();
984 
985   PreservedAnalyses PA;
986   PA.preserveSet<AllAnalysesOn<Module>>();
987   PA.preserveSet<AllAnalysesOn<Function>>();
988   PA.preserveSet<AllAnalysesOn<Loop>>();
989   return PA;
990 }
991 
992 llvm::PreservedAnalyses
993 IslScheduleOptimizerPass::run(Scop &S, ScopAnalysisManager &SAM,
994                               ScopStandardAnalysisResults &SAR, SPMUpdater &U) {
995   return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, nullptr);
996 }
997 
998 llvm::PreservedAnalyses
999 IslScheduleOptimizerPrinterPass::run(Scop &S, ScopAnalysisManager &SAM,
1000                                      ScopStandardAnalysisResults &SAR,
1001                                      SPMUpdater &U) {
1002   return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, &OS);
1003 }
1004 
1005 //===----------------------------------------------------------------------===//
1006 
1007 namespace {
1008 /// Print result from IslScheduleOptimizerWrapperPass.
1009 class IslScheduleOptimizerPrinterLegacyPass final : public ScopPass {
1010 public:
1011   static char ID;
1012 
1013   IslScheduleOptimizerPrinterLegacyPass()
1014       : IslScheduleOptimizerPrinterLegacyPass(outs()) {}
1015   explicit IslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS)
1016       : ScopPass(ID), OS(OS) {}
1017 
1018   bool runOnScop(Scop &S) override {
1019     IslScheduleOptimizerWrapperPass &P =
1020         getAnalysis<IslScheduleOptimizerWrapperPass>();
1021 
1022     OS << "Printing analysis '" << P.getPassName() << "' for region: '"
1023        << S.getRegion().getNameStr() << "' in function '"
1024        << S.getFunction().getName() << "':\n";
1025     P.printScop(OS, S);
1026 
1027     return false;
1028   }
1029 
1030   void getAnalysisUsage(AnalysisUsage &AU) const override {
1031     ScopPass::getAnalysisUsage(AU);
1032     AU.addRequired<IslScheduleOptimizerWrapperPass>();
1033     AU.setPreservesAll();
1034   }
1035 
1036 private:
1037   llvm::raw_ostream &OS;
1038 };
1039 
1040 char IslScheduleOptimizerPrinterLegacyPass::ID = 0;
1041 } // namespace
1042 
1043 Pass *polly::createIslScheduleOptimizerPrinterLegacyPass(raw_ostream &OS) {
1044   return new IslScheduleOptimizerPrinterLegacyPass(OS);
1045 }
1046 
1047 INITIALIZE_PASS_BEGIN(IslScheduleOptimizerPrinterLegacyPass,
1048                       "polly-print-opt-isl",
1049                       "Polly - Print optimizer schedule of SCoP", false, false);
1050 INITIALIZE_PASS_DEPENDENCY(IslScheduleOptimizerWrapperPass)
1051 INITIALIZE_PASS_END(IslScheduleOptimizerPrinterLegacyPass,
1052                     "polly-print-opt-isl",
1053                     "Polly - Print optimizer schedule of SCoP", false, false)
1054