1 //===- ScheduleOptimizer.cpp - Calculate an optimized schedule ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass generates an entirely new schedule tree from the data dependences
10 // and iteration domains. The new schedule tree is computed in two steps:
11 //
12 // 1) The isl scheduling optimizer is run
13 //
14 // The isl scheduling optimizer creates a new schedule tree that maximizes
15 // parallelism and tileability and minimizes data-dependence distances. The
16 // algorithm used is a modified version of the ``Pluto'' algorithm:
17 //
18 //   U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
19 //   A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
20 //   In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
21 //   Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008.
22 //
23 // 2) A set of post-scheduling transformations is applied on the schedule tree.
24 //
25 // These optimizations include:
26 //
27 //  - Tiling of the innermost tilable bands
28 //  - Prevectorization - The choice of a possible outer loop that is strip-mined
29 //                       to the innermost level to enable inner-loop
30 //                       vectorization.
31 //  - Some optimizations for spatial locality are also planned.
32 //
33 // For a detailed description of the schedule tree itself please see section 6
34 // of:
35 //
36 // Polyhedral AST generation is more than scanning polyhedra
37 // Tobias Grosser, Sven Verdoolaege, Albert Cohen
38 // ACM Transactions on Programming Languages and Systems (TOPLAS),
39 // 37(4), July 2015
40 // http://www.grosser.es/#pub-polyhedral-AST-generation
41 //
42 // This publication also contains a detailed discussion of the different options
43 // for polyhedral loop unrolling, full/partial tile separation and other uses
44 // of the schedule tree.
45 //
46 //===----------------------------------------------------------------------===//
47 
48 #include "polly/ScheduleOptimizer.h"
49 #include "polly/CodeGen/CodeGeneration.h"
50 #include "polly/DependenceInfo.h"
51 #include "polly/ManualOptimizer.h"
52 #include "polly/MatmulOptimizer.h"
53 #include "polly/Options.h"
54 #include "polly/ScheduleTreeTransform.h"
55 #include "polly/Support/ISLOStream.h"
56 #include "polly/Support/ISLTools.h"
57 #include "llvm/ADT/Sequence.h"
58 #include "llvm/ADT/Statistic.h"
59 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
60 #include "llvm/InitializePasses.h"
61 #include "llvm/Support/CommandLine.h"
62 #include "isl/options.h"
63 
64 using namespace llvm;
65 using namespace polly;
66 
67 namespace llvm {
68 class Loop;
69 class Module;
70 } // namespace llvm
71 
72 #define DEBUG_TYPE "polly-opt-isl"
73 
74 static cl::opt<std::string>
75     OptimizeDeps("polly-opt-optimize-only",
76                  cl::desc("Only a certain kind of dependences (all/raw)"),
77                  cl::Hidden, cl::init("all"), cl::cat(PollyCategory));
78 
79 static cl::opt<std::string>
80     SimplifyDeps("polly-opt-simplify-deps",
81                  cl::desc("Dependences should be simplified (yes/no)"),
82                  cl::Hidden, cl::init("yes"), cl::cat(PollyCategory));
83 
84 static cl::opt<int> MaxConstantTerm(
85     "polly-opt-max-constant-term",
86     cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden,
87     cl::init(20), cl::cat(PollyCategory));
88 
89 static cl::opt<int> MaxCoefficient(
90     "polly-opt-max-coefficient",
91     cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden,
92     cl::init(20), cl::cat(PollyCategory));
93 
94 static cl::opt<std::string>
95     MaximizeBandDepth("polly-opt-maximize-bands",
96                       cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
97                       cl::init("yes"), cl::cat(PollyCategory));
98 
99 static cl::opt<bool>
100     GreedyFusion("polly-loopfusion-greedy",
101                  cl::desc("Aggressively try to fuse everything"), cl::Hidden,
102                  cl::cat(PollyCategory));
103 
104 static cl::opt<std::string> OuterCoincidence(
105     "polly-opt-outer-coincidence",
106     cl::desc("Try to construct schedules where the outer member of each band "
107              "satisfies the coincidence constraints (yes/no)"),
108     cl::Hidden, cl::init("no"), cl::cat(PollyCategory));
109 
110 static cl::opt<int> PrevectorWidth(
111     "polly-prevect-width",
112     cl::desc(
113         "The number of loop iterations to strip-mine for pre-vectorization"),
114     cl::Hidden, cl::init(4), cl::cat(PollyCategory));
115 
116 static cl::opt<bool> FirstLevelTiling("polly-tiling",
117                                       cl::desc("Enable loop tiling"),
118                                       cl::init(true), cl::cat(PollyCategory));
119 
120 static cl::opt<int> FirstLevelDefaultTileSize(
121     "polly-default-tile-size",
122     cl::desc("The default tile size (if not enough were provided by"
123              " --polly-tile-sizes)"),
124     cl::Hidden, cl::init(32), cl::cat(PollyCategory));
125 
126 static cl::list<int>
127     FirstLevelTileSizes("polly-tile-sizes",
128                         cl::desc("A tile size for each loop dimension, filled "
129                                  "with --polly-default-tile-size"),
130                         cl::Hidden, cl::CommaSeparated, cl::cat(PollyCategory));
131 
132 static cl::opt<bool>
133     SecondLevelTiling("polly-2nd-level-tiling",
134                       cl::desc("Enable a 2nd level loop of loop tiling"),
135                       cl::cat(PollyCategory));
136 
137 static cl::opt<int> SecondLevelDefaultTileSize(
138     "polly-2nd-level-default-tile-size",
139     cl::desc("The default 2nd-level tile size (if not enough were provided by"
140              " --polly-2nd-level-tile-sizes)"),
141     cl::Hidden, cl::init(16), cl::cat(PollyCategory));
142 
143 static cl::list<int>
144     SecondLevelTileSizes("polly-2nd-level-tile-sizes",
145                          cl::desc("A tile size for each loop dimension, filled "
146                                   "with --polly-default-tile-size"),
147                          cl::Hidden, cl::CommaSeparated,
148                          cl::cat(PollyCategory));
149 
150 static cl::opt<bool> RegisterTiling("polly-register-tiling",
151                                     cl::desc("Enable register tiling"),
152                                     cl::cat(PollyCategory));
153 
154 static cl::opt<int> RegisterDefaultTileSize(
155     "polly-register-tiling-default-tile-size",
156     cl::desc("The default register tile size (if not enough were provided by"
157              " --polly-register-tile-sizes)"),
158     cl::Hidden, cl::init(2), cl::cat(PollyCategory));
159 
160 static cl::list<int>
161     RegisterTileSizes("polly-register-tile-sizes",
162                       cl::desc("A tile size for each loop dimension, filled "
163                                "with --polly-register-tile-size"),
164                       cl::Hidden, cl::CommaSeparated, cl::cat(PollyCategory));
165 
166 static cl::opt<bool> PragmaBasedOpts(
167     "polly-pragma-based-opts",
168     cl::desc("Apply user-directed transformation from metadata"),
169     cl::init(true), cl::cat(PollyCategory));
170 
171 static cl::opt<bool> EnableReschedule("polly-reschedule",
172                                       cl::desc("Optimize SCoPs using ISL"),
173                                       cl::init(true), cl::cat(PollyCategory));
174 
175 static cl::opt<bool>
176     PMBasedOpts("polly-pattern-matching-based-opts",
177                 cl::desc("Perform optimizations based on pattern matching"),
178                 cl::init(true), cl::cat(PollyCategory));
179 
180 static cl::opt<bool>
181     EnablePostopts("polly-postopts",
182                    cl::desc("Apply post-rescheduling optimizations such as "
183                             "tiling (requires -polly-reschedule)"),
184                    cl::init(true), cl::cat(PollyCategory));
185 
186 static cl::opt<bool> OptimizedScops(
187     "polly-optimized-scops",
188     cl::desc("Polly - Dump polyhedral description of Scops optimized with "
189              "the isl scheduling optimizer and the set of post-scheduling "
190              "transformations is applied on the schedule tree"),
191     cl::cat(PollyCategory));
192 
193 STATISTIC(ScopsProcessed, "Number of scops processed");
194 STATISTIC(ScopsRescheduled, "Number of scops rescheduled");
195 STATISTIC(ScopsOptimized, "Number of scops optimized");
196 
197 STATISTIC(NumAffineLoopsOptimized, "Number of affine loops optimized");
198 STATISTIC(NumBoxedLoopsOptimized, "Number of boxed loops optimized");
199 
200 #define THREE_STATISTICS(VARNAME, DESC)                                        \
201   static Statistic VARNAME[3] = {                                              \
202       {DEBUG_TYPE, #VARNAME "0", DESC " (original)"},                          \
203       {DEBUG_TYPE, #VARNAME "1", DESC " (after scheduler)"},                   \
204       {DEBUG_TYPE, #VARNAME "2", DESC " (after optimizer)"}}
205 
206 THREE_STATISTICS(NumBands, "Number of bands");
207 THREE_STATISTICS(NumBandMembers, "Number of band members");
208 THREE_STATISTICS(NumCoincident, "Number of coincident band members");
209 THREE_STATISTICS(NumPermutable, "Number of permutable bands");
210 THREE_STATISTICS(NumFilters, "Number of filter nodes");
211 THREE_STATISTICS(NumExtension, "Number of extension nodes");
212 
213 STATISTIC(FirstLevelTileOpts, "Number of first level tiling applied");
214 STATISTIC(SecondLevelTileOpts, "Number of second level tiling applied");
215 STATISTIC(RegisterTileOpts, "Number of register tiling applied");
216 STATISTIC(PrevectOpts, "Number of strip-mining for prevectorization applied");
217 STATISTIC(MatMulOpts,
218           "Number of matrix multiplication patterns detected and optimized");
219 
220 namespace {
221 /// Additional parameters of the schedule optimizer.
222 ///
223 /// Target Transform Info and the SCoP dependencies used by the schedule
224 /// optimizer.
225 struct OptimizerAdditionalInfoTy {
226   const llvm::TargetTransformInfo *TTI;
227   const Dependences *D;
228   bool PatternOpts;
229   bool Postopts;
230   bool Prevect;
231   bool &DepsChanged;
232 };
233 
234 class ScheduleTreeOptimizer final {
235 public:
236   /// Apply schedule tree transformations.
237   ///
238   /// This function takes an (possibly already optimized) schedule tree and
239   /// applies a set of additional optimizations on the schedule tree. The
240   /// transformations applied include:
241   ///
242   ///   - Pattern-based optimizations
243   ///   - Tiling
244   ///   - Prevectorization
245   ///
246   /// @param Schedule The schedule object the transformations will be applied
247   ///                 to.
248   /// @param OAI      Target Transform Info and the SCoP dependencies.
249   /// @returns        The transformed schedule.
250   static isl::schedule
251   optimizeSchedule(isl::schedule Schedule,
252                    const OptimizerAdditionalInfoTy *OAI = nullptr);
253 
254   /// Apply schedule tree transformations.
255   ///
256   /// This function takes a node in an (possibly already optimized) schedule
257   /// tree and applies a set of additional optimizations on this schedule tree
258   /// node and its descendants. The transformations applied include:
259   ///
260   ///   - Pattern-based optimizations
261   ///   - Tiling
262   ///   - Prevectorization
263   ///
264   /// @param Node The schedule object post-transformations will be applied to.
265   /// @param OAI  Target Transform Info and the SCoP dependencies.
266   /// @returns    The transformed schedule.
267   static isl::schedule_node
268   optimizeScheduleNode(isl::schedule_node Node,
269                        const OptimizerAdditionalInfoTy *OAI = nullptr);
270 
271   /// Decide if the @p NewSchedule is profitable for @p S.
272   ///
273   /// @param S           The SCoP we optimize.
274   /// @param NewSchedule The new schedule we computed.
275   ///
276   /// @return True, if we believe @p NewSchedule is an improvement for @p S.
277   static bool isProfitableSchedule(polly::Scop &S, isl::schedule NewSchedule);
278 
279   /// Isolate a set of partial tile prefixes.
280   ///
281   /// This set should ensure that it contains only partial tile prefixes that
282   /// have exactly VectorWidth iterations.
283   ///
284   /// @param Node A schedule node band, which is a parent of a band node,
285   ///             that contains a vector loop.
286   /// @return Modified isl_schedule_node.
287   static isl::schedule_node isolateFullPartialTiles(isl::schedule_node Node,
288                                                     int VectorWidth);
289 
290 private:
291   /// Check if this node is a band node we want to tile.
292   ///
293   /// We look for innermost band nodes where individual dimensions are marked as
294   /// permutable.
295   ///
296   /// @param Node The node to check.
297   static bool isTileableBandNode(isl::schedule_node Node);
298 
299   /// Pre-vectorizes one scheduling dimension of a schedule band.
300   ///
301   /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and
302   /// sinks the resulting point loop.
303   ///
304   /// Example (DimToVectorize=0, VectorWidth=4):
305   ///
306   /// | Before transformation:
307   /// |
308   /// | A[i,j] -> [i,j]
309   /// |
310   /// | for (i = 0; i < 128; i++)
311   /// |    for (j = 0; j < 128; j++)
312   /// |      A(i,j);
313   ///
314   /// | After transformation:
315   /// |
316   /// | for (it = 0; it < 32; it+=1)
317   /// |    for (j = 0; j < 128; j++)
318   /// |      for (ip = 0; ip <= 3; ip++)
319   /// |        A(4 * it + ip,j);
320   ///
321   /// The goal of this transformation is to create a trivially vectorizable
322   /// loop.  This means a parallel loop at the innermost level that has a
323   /// constant number of iterations corresponding to the target vector width.
324   ///
325   /// This transformation creates a loop at the innermost level. The loop has
326   /// a constant number of iterations, if the number of loop iterations at
327   /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is
328   /// currently constant and not yet target specific. This function does not
329   /// reason about parallelism.
330   static isl::schedule_node prevectSchedBand(isl::schedule_node Node,
331                                              unsigned DimToVectorize,
332                                              int VectorWidth);
333 
334   /// Apply additional optimizations on the bands in the schedule tree.
335   ///
336   /// We are looking for an innermost band node and apply the following
337   /// transformations:
338   ///
339   ///  - Tile the band
340   ///      - if the band is tileable
341   ///      - if the band has more than one loop dimension
342   ///
343   ///  - Prevectorize the schedule of the band (or the point loop in case of
344   ///    tiling).
345   ///      - if vectorization is enabled
346   ///
347   /// @param Node The schedule node to (possibly) optimize.
348   /// @param User A pointer to forward some use information
349   ///        (currently unused).
350   static isl_schedule_node *optimizeBand(isl_schedule_node *Node, void *User);
351 
352   /// Apply tiling optimizations on the bands in the schedule tree.
353   ///
354   /// @param Node The schedule node to (possibly) optimize.
355   static isl::schedule_node applyTileBandOpt(isl::schedule_node Node);
356 
357   /// Apply prevectorization on the bands in the schedule tree.
358   ///
359   /// @param Node The schedule node to (possibly) prevectorize.
360   static isl::schedule_node applyPrevectBandOpt(isl::schedule_node Node);
361 };
362 
363 isl::schedule_node
isolateFullPartialTiles(isl::schedule_node Node,int VectorWidth)364 ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node,
365                                                int VectorWidth) {
366   assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
367   Node = Node.child(0).child(0);
368   isl::union_map SchedRelUMap = Node.get_prefix_schedule_relation();
369   isl::union_set ScheduleRangeUSet = SchedRelUMap.range();
370   isl::set ScheduleRange{ScheduleRangeUSet};
371   isl::set IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth);
372   auto AtomicOption = getDimOptions(IsolateDomain.ctx(), "atomic");
373   isl::union_set IsolateOption = getIsolateOptions(IsolateDomain, 1);
374   Node = Node.parent().parent();
375   isl::union_set Options = IsolateOption.unite(AtomicOption);
376   isl::schedule_node_band Result =
377       Node.as<isl::schedule_node_band>().set_ast_build_options(Options);
378   return Result;
379 }
380 
381 struct InsertSimdMarkers final : ScheduleNodeRewriter<InsertSimdMarkers> {
visitBand__anonc229ce900111::InsertSimdMarkers382   isl::schedule_node visitBand(isl::schedule_node_band Band) {
383     isl::schedule_node Node = visitChildren(Band);
384 
385     // Only add SIMD markers to innermost bands.
386     if (!Node.first_child().isa<isl::schedule_node_leaf>())
387       return Node;
388 
389     isl::id LoopMarker = isl::id::alloc(Band.ctx(), "SIMD", nullptr);
390     return Band.insert_mark(LoopMarker);
391   }
392 };
393 
prevectSchedBand(isl::schedule_node Node,unsigned DimToVectorize,int VectorWidth)394 isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand(
395     isl::schedule_node Node, unsigned DimToVectorize, int VectorWidth) {
396   assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
397 
398   auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
399   unsigned ScheduleDimensions = unsignedFromIslSize(Space.dim(isl::dim::set));
400   assert(DimToVectorize < ScheduleDimensions);
401 
402   if (DimToVectorize > 0) {
403     Node = isl::manage(
404         isl_schedule_node_band_split(Node.release(), DimToVectorize));
405     Node = Node.child(0);
406   }
407   if (DimToVectorize < ScheduleDimensions - 1)
408     Node = isl::manage(isl_schedule_node_band_split(Node.release(), 1));
409   Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
410   auto Sizes = isl::multi_val::zero(Space);
411   Sizes = Sizes.set_val(0, isl::val(Node.ctx(), VectorWidth));
412   Node =
413       isl::manage(isl_schedule_node_band_tile(Node.release(), Sizes.release()));
414   Node = isolateFullPartialTiles(Node, VectorWidth);
415   Node = Node.child(0);
416   // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise,
417   // we will have troubles to match it in the backend.
418   Node = Node.as<isl::schedule_node_band>().set_ast_build_options(
419       isl::union_set(Node.ctx(), "{ unroll[x]: 1 = 0 }"));
420 
421   // Sink the inner loop into the smallest possible statements to make them
422   // represent a single vector instruction if possible.
423   Node = isl::manage(isl_schedule_node_band_sink(Node.release()));
424 
425   // Add SIMD markers to those vector statements.
426   InsertSimdMarkers SimdMarkerInserter;
427   Node = SimdMarkerInserter.visit(Node);
428 
429   PrevectOpts++;
430   return Node.parent();
431 }
432 
isSimpleInnermostBand(const isl::schedule_node & Node)433 static bool isSimpleInnermostBand(const isl::schedule_node &Node) {
434   assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
435   assert(isl_schedule_node_n_children(Node.get()) == 1);
436 
437   auto ChildType = isl_schedule_node_get_type(Node.child(0).get());
438 
439   if (ChildType == isl_schedule_node_leaf)
440     return true;
441 
442   if (ChildType != isl_schedule_node_sequence)
443     return false;
444 
445   auto Sequence = Node.child(0);
446 
447   for (int c = 0, nc = isl_schedule_node_n_children(Sequence.get()); c < nc;
448        ++c) {
449     auto Child = Sequence.child(c);
450     if (isl_schedule_node_get_type(Child.get()) != isl_schedule_node_filter)
451       return false;
452     if (isl_schedule_node_get_type(Child.child(0).get()) !=
453         isl_schedule_node_leaf)
454       return false;
455   }
456   return true;
457 }
458 
isTileableBandNode(isl::schedule_node Node)459 bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) {
460   if (isl_schedule_node_get_type(Node.get()) != isl_schedule_node_band)
461     return false;
462 
463   if (isl_schedule_node_n_children(Node.get()) != 1)
464     return false;
465 
466   if (!isl_schedule_node_band_get_permutable(Node.get()))
467     return false;
468 
469   auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
470 
471   if (unsignedFromIslSize(Space.dim(isl::dim::set)) <= 1u)
472     return false;
473 
474   return isSimpleInnermostBand(Node);
475 }
476 
477 __isl_give isl::schedule_node
applyTileBandOpt(isl::schedule_node Node)478 ScheduleTreeOptimizer::applyTileBandOpt(isl::schedule_node Node) {
479   if (FirstLevelTiling) {
480     Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes,
481                     FirstLevelDefaultTileSize);
482     FirstLevelTileOpts++;
483   }
484 
485   if (SecondLevelTiling) {
486     Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes,
487                     SecondLevelDefaultTileSize);
488     SecondLevelTileOpts++;
489   }
490 
491   if (RegisterTiling) {
492     Node =
493         applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize);
494     RegisterTileOpts++;
495   }
496 
497   return Node;
498 }
499 
500 isl::schedule_node
applyPrevectBandOpt(isl::schedule_node Node)501 ScheduleTreeOptimizer::applyPrevectBandOpt(isl::schedule_node Node) {
502   auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
503   int Dims = unsignedFromIslSize(Space.dim(isl::dim::set));
504 
505   for (int i = Dims - 1; i >= 0; i--)
506     if (Node.as<isl::schedule_node_band>().member_get_coincident(i)) {
507       Node = prevectSchedBand(Node, i, PrevectorWidth);
508       break;
509     }
510 
511   return Node;
512 }
513 
514 __isl_give isl_schedule_node *
optimizeBand(__isl_take isl_schedule_node * NodeArg,void * User)515 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg,
516                                     void *User) {
517   const OptimizerAdditionalInfoTy *OAI =
518       static_cast<const OptimizerAdditionalInfoTy *>(User);
519   assert(OAI && "Expecting optimization options");
520 
521   isl::schedule_node Node = isl::manage(NodeArg);
522   if (!isTileableBandNode(Node))
523     return Node.release();
524 
525   if (OAI->PatternOpts) {
526     isl::schedule_node PatternOptimizedSchedule =
527         tryOptimizeMatMulPattern(Node, OAI->TTI, OAI->D);
528     if (!PatternOptimizedSchedule.is_null()) {
529       MatMulOpts++;
530       OAI->DepsChanged = true;
531       return PatternOptimizedSchedule.release();
532     }
533   }
534 
535   if (OAI->Postopts)
536     Node = applyTileBandOpt(Node);
537 
538   if (OAI->Prevect) {
539     // FIXME: Prevectorization requirements are different from those checked by
540     // isTileableBandNode.
541     Node = applyPrevectBandOpt(Node);
542   }
543 
544   return Node.release();
545 }
546 
547 isl::schedule
optimizeSchedule(isl::schedule Schedule,const OptimizerAdditionalInfoTy * OAI)548 ScheduleTreeOptimizer::optimizeSchedule(isl::schedule Schedule,
549                                         const OptimizerAdditionalInfoTy *OAI) {
550   auto Root = Schedule.get_root();
551   Root = optimizeScheduleNode(Root, OAI);
552   return Root.get_schedule();
553 }
554 
optimizeScheduleNode(isl::schedule_node Node,const OptimizerAdditionalInfoTy * OAI)555 isl::schedule_node ScheduleTreeOptimizer::optimizeScheduleNode(
556     isl::schedule_node Node, const OptimizerAdditionalInfoTy *OAI) {
557   Node = isl::manage(isl_schedule_node_map_descendant_bottom_up(
558       Node.release(), optimizeBand,
559       const_cast<void *>(static_cast<const void *>(OAI))));
560   return Node;
561 }
562 
isProfitableSchedule(Scop & S,isl::schedule NewSchedule)563 bool ScheduleTreeOptimizer::isProfitableSchedule(Scop &S,
564                                                  isl::schedule NewSchedule) {
565   // To understand if the schedule has been optimized we check if the schedule
566   // has changed at all.
567   // TODO: We can improve this by tracking if any necessarily beneficial
568   // transformations have been performed. This can e.g. be tiling, loop
569   // interchange, or ...) We can track this either at the place where the
570   // transformation has been performed or, in case of automatic ILP based
571   // optimizations, by comparing (yet to be defined) performance metrics
572   // before/after the scheduling optimizer
573   // (e.g., #stride-one accesses)
574   // FIXME: A schedule tree whose union_map-conversion is identical to the
575   // original schedule map may still allow for parallelization, i.e. can still
576   // be profitable.
577   auto NewScheduleMap = NewSchedule.get_map();
578   auto OldSchedule = S.getSchedule();
579   assert(!OldSchedule.is_null() &&
580          "Only IslScheduleOptimizer can insert extension nodes "
581          "that make Scop::getSchedule() return nullptr.");
582   bool changed = !OldSchedule.is_equal(NewScheduleMap);
583   return changed;
584 }
585 
586 class IslScheduleOptimizerWrapperPass final : public ScopPass {
587 public:
588   static char ID;
589 
IslScheduleOptimizerWrapperPass()590   explicit IslScheduleOptimizerWrapperPass() : ScopPass(ID) {}
591 
592   /// Optimize the schedule of the SCoP @p S.
593   bool runOnScop(Scop &S) override;
594 
595   /// Print the new schedule for the SCoP @p S.
596   void printScop(raw_ostream &OS, Scop &S) const override;
597 
598   /// Register all analyses and transformation required.
599   void getAnalysisUsage(AnalysisUsage &AU) const override;
600 
601   /// Release the internal memory.
releaseMemory()602   void releaseMemory() override {
603     LastSchedule = {};
604     IslCtx.reset();
605   }
606 
607 private:
608   std::shared_ptr<isl_ctx> IslCtx;
609   isl::schedule LastSchedule;
610 };
611 
612 char IslScheduleOptimizerWrapperPass::ID = 0;
613 
614 #ifndef NDEBUG
printSchedule(llvm::raw_ostream & OS,const isl::schedule & Schedule,StringRef Desc)615 static void printSchedule(llvm::raw_ostream &OS, const isl::schedule &Schedule,
616                           StringRef Desc) {
617   isl::ctx Ctx = Schedule.ctx();
618   isl_printer *P = isl_printer_to_str(Ctx.get());
619   P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
620   P = isl_printer_print_schedule(P, Schedule.get());
621   char *Str = isl_printer_get_str(P);
622   OS << Desc << ": \n" << Str << "\n";
623   free(Str);
624   isl_printer_free(P);
625 }
626 #endif
627 
628 /// Collect statistics for the schedule tree.
629 ///
630 /// @param Schedule The schedule tree to analyze. If not a schedule tree it is
631 /// ignored.
632 /// @param Version  The version of the schedule tree that is analyzed.
633 ///                 0 for the original schedule tree before any transformation.
634 ///                 1 for the schedule tree after isl's rescheduling.
635 ///                 2 for the schedule tree after optimizations are applied
636 ///                 (tiling, pattern matching)
walkScheduleTreeForStatistics(isl::schedule Schedule,int Version)637 static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) {
638   auto Root = Schedule.get_root();
639   if (Root.is_null())
640     return;
641 
642   isl_schedule_node_foreach_descendant_top_down(
643       Root.get(),
644       [](__isl_keep isl_schedule_node *nodeptr, void *user) -> isl_bool {
645         isl::schedule_node Node = isl::manage_copy(nodeptr);
646         int Version = *static_cast<int *>(user);
647 
648         switch (isl_schedule_node_get_type(Node.get())) {
649         case isl_schedule_node_band: {
650           NumBands[Version]++;
651           if (isl_schedule_node_band_get_permutable(Node.get()) ==
652               isl_bool_true)
653             NumPermutable[Version]++;
654 
655           int CountMembers = isl_schedule_node_band_n_member(Node.get());
656           NumBandMembers[Version] += CountMembers;
657           for (int i = 0; i < CountMembers; i += 1) {
658             if (Node.as<isl::schedule_node_band>().member_get_coincident(i))
659               NumCoincident[Version]++;
660           }
661           break;
662         }
663 
664         case isl_schedule_node_filter:
665           NumFilters[Version]++;
666           break;
667 
668         case isl_schedule_node_extension:
669           NumExtension[Version]++;
670           break;
671 
672         default:
673           break;
674         }
675 
676         return isl_bool_true;
677       },
678       &Version);
679 }
680 
runIslScheduleOptimizer(Scop & S,function_ref<const Dependences & (Dependences::AnalysisLevel)> GetDeps,TargetTransformInfo * TTI,OptimizationRemarkEmitter * ORE,isl::schedule & LastSchedule,bool & DepsChanged)681 static void runIslScheduleOptimizer(
682     Scop &S,
683     function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps,
684     TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE,
685     isl::schedule &LastSchedule, bool &DepsChanged) {
686 
687   // Skip SCoPs in case they're already optimised by PPCGCodeGeneration
688   if (S.isToBeSkipped())
689     return;
690 
691   // Skip empty SCoPs but still allow code generation as it will delete the
692   // loops present but not needed.
693   if (S.getSize() == 0) {
694     S.markAsOptimized();
695     return;
696   }
697 
698   ScopsProcessed++;
699 
700   // Schedule without optimizations.
701   isl::schedule Schedule = S.getScheduleTree();
702   walkScheduleTreeForStatistics(S.getScheduleTree(), 0);
703   LLVM_DEBUG(printSchedule(dbgs(), Schedule, "Original schedule tree"));
704 
705   bool HasUserTransformation = false;
706   if (PragmaBasedOpts) {
707     isl::schedule ManuallyTransformed = applyManualTransformations(
708         &S, Schedule, GetDeps(Dependences::AL_Statement), ORE);
709     if (ManuallyTransformed.is_null()) {
710       LLVM_DEBUG(dbgs() << "Error during manual optimization\n");
711       return;
712     }
713 
714     if (ManuallyTransformed.get() != Schedule.get()) {
715       // User transformations have precedence over other transformations.
716       HasUserTransformation = true;
717       Schedule = std::move(ManuallyTransformed);
718       LLVM_DEBUG(
719           printSchedule(dbgs(), Schedule, "After manual transformations"));
720     }
721   }
722 
723   // Only continue if either manual transformations have been applied or we are
724   // allowed to apply heuristics.
725   // TODO: Detect disabled heuristics and no user-directed transformation
726   // metadata earlier in ScopDetection.
727   if (!HasUserTransformation && S.hasDisableHeuristicsHint()) {
728     LLVM_DEBUG(dbgs() << "Heuristic optimizations disabled by metadata\n");
729     return;
730   }
731 
732   // Get dependency analysis.
733   const Dependences &D = GetDeps(Dependences::AL_Statement);
734   if (D.getSharedIslCtx() != S.getSharedIslCtx()) {
735     LLVM_DEBUG(dbgs() << "DependenceInfo for another SCoP/isl_ctx\n");
736     return;
737   }
738   if (!D.hasValidDependences()) {
739     LLVM_DEBUG(dbgs() << "Dependency information not available\n");
740     return;
741   }
742 
743   // Apply ISL's algorithm only if not overriden by the user. Note that
744   // post-rescheduling optimizations (tiling, pattern-based, prevectorization)
745   // rely on the coincidence/permutable annotations on schedule tree bands that
746   // are added by the rescheduling analyzer. Therefore, disabling the
747   // rescheduler implicitly also disables these optimizations.
748   if (!EnableReschedule) {
749     LLVM_DEBUG(dbgs() << "Skipping rescheduling due to command line option\n");
750   } else if (HasUserTransformation) {
751     LLVM_DEBUG(
752         dbgs() << "Skipping rescheduling due to manual transformation\n");
753   } else {
754     // Build input data.
755     int ValidityKinds =
756         Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
757     int ProximityKinds;
758 
759     if (OptimizeDeps == "all")
760       ProximityKinds =
761           Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
762     else if (OptimizeDeps == "raw")
763       ProximityKinds = Dependences::TYPE_RAW;
764     else {
765       errs() << "Do not know how to optimize for '" << OptimizeDeps << "'"
766              << " Falling back to optimizing all dependences.\n";
767       ProximityKinds =
768           Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
769     }
770 
771     isl::union_set Domain = S.getDomains();
772 
773     if (Domain.is_null())
774       return;
775 
776     isl::union_map Validity = D.getDependences(ValidityKinds);
777     isl::union_map Proximity = D.getDependences(ProximityKinds);
778 
779     // Simplify the dependences by removing the constraints introduced by the
780     // domains. This can speed up the scheduling time significantly, as large
781     // constant coefficients will be removed from the dependences. The
782     // introduction of some additional dependences reduces the possible
783     // transformations, but in most cases, such transformation do not seem to be
784     // interesting anyway. In some cases this option may stop the scheduler to
785     // find any schedule.
786     if (SimplifyDeps == "yes") {
787       Validity = Validity.gist_domain(Domain);
788       Validity = Validity.gist_range(Domain);
789       Proximity = Proximity.gist_domain(Domain);
790       Proximity = Proximity.gist_range(Domain);
791     } else if (SimplifyDeps != "no") {
792       errs()
793           << "warning: Option -polly-opt-simplify-deps should either be 'yes' "
794              "or 'no'. Falling back to default: 'yes'\n";
795     }
796 
797     LLVM_DEBUG(dbgs() << "\n\nCompute schedule from: ");
798     LLVM_DEBUG(dbgs() << "Domain := " << Domain << ";\n");
799     LLVM_DEBUG(dbgs() << "Proximity := " << Proximity << ";\n");
800     LLVM_DEBUG(dbgs() << "Validity := " << Validity << ";\n");
801 
802     int IslMaximizeBands;
803     if (MaximizeBandDepth == "yes") {
804       IslMaximizeBands = 1;
805     } else if (MaximizeBandDepth == "no") {
806       IslMaximizeBands = 0;
807     } else {
808       errs()
809           << "warning: Option -polly-opt-maximize-bands should either be 'yes'"
810              " or 'no'. Falling back to default: 'yes'\n";
811       IslMaximizeBands = 1;
812     }
813 
814     int IslOuterCoincidence;
815     if (OuterCoincidence == "yes") {
816       IslOuterCoincidence = 1;
817     } else if (OuterCoincidence == "no") {
818       IslOuterCoincidence = 0;
819     } else {
820       errs() << "warning: Option -polly-opt-outer-coincidence should either be "
821                 "'yes' or 'no'. Falling back to default: 'no'\n";
822       IslOuterCoincidence = 0;
823     }
824 
825     isl_ctx *Ctx = S.getIslCtx().get();
826 
827     isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence);
828     isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands);
829     isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm);
830     isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient);
831     isl_options_set_tile_scale_tile_loops(Ctx, 0);
832 
833     auto OnErrorStatus = isl_options_get_on_error(Ctx);
834     isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE);
835 
836     auto SC = isl::schedule_constraints::on_domain(Domain);
837     SC = SC.set_proximity(Proximity);
838     SC = SC.set_validity(Validity);
839     SC = SC.set_coincidence(Validity);
840     Schedule = SC.compute_schedule();
841     isl_options_set_on_error(Ctx, OnErrorStatus);
842 
843     ScopsRescheduled++;
844     LLVM_DEBUG(printSchedule(dbgs(), Schedule, "After rescheduling"));
845   }
846 
847   walkScheduleTreeForStatistics(Schedule, 1);
848 
849   // In cases the scheduler is not able to optimize the code, we just do not
850   // touch the schedule.
851   if (Schedule.is_null())
852     return;
853 
854   if (GreedyFusion) {
855     isl::union_map Validity = D.getDependences(
856         Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW);
857     Schedule = applyGreedyFusion(Schedule, Validity);
858     assert(!Schedule.is_null());
859   }
860 
861   // Apply post-rescheduling optimizations (if enabled) and/or prevectorization.
862   const OptimizerAdditionalInfoTy OAI = {
863       TTI,
864       const_cast<Dependences *>(&D),
865       /*PatternOpts=*/!HasUserTransformation && PMBasedOpts,
866       /*Postopts=*/!HasUserTransformation && EnablePostopts,
867       /*Prevect=*/PollyVectorizerChoice != VECTORIZER_NONE,
868       DepsChanged};
869   if (OAI.PatternOpts || OAI.Postopts || OAI.Prevect) {
870     Schedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI);
871     Schedule = hoistExtensionNodes(Schedule);
872     LLVM_DEBUG(printSchedule(dbgs(), Schedule, "After post-optimizations"));
873     walkScheduleTreeForStatistics(Schedule, 2);
874   }
875 
876   // Skip profitability check if user transformation(s) have been applied.
877   if (!HasUserTransformation &&
878       !ScheduleTreeOptimizer::isProfitableSchedule(S, Schedule))
879     return;
880 
881   auto ScopStats = S.getStatistics();
882   ScopsOptimized++;
883   NumAffineLoopsOptimized += ScopStats.NumAffineLoops;
884   NumBoxedLoopsOptimized += ScopStats.NumBoxedLoops;
885   LastSchedule = Schedule;
886 
887   S.setScheduleTree(Schedule);
888   S.markAsOptimized();
889 
890   if (OptimizedScops)
891     errs() << S;
892 }
893 
runOnScop(Scop & S)894 bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) {
895   releaseMemory();
896 
897   Function &F = S.getFunction();
898   IslCtx = S.getSharedIslCtx();
899 
900   auto getDependences =
901       [this](Dependences::AnalysisLevel) -> const Dependences & {
902     return getAnalysis<DependenceInfo>().getDependences(
903         Dependences::AL_Statement);
904   };
905   OptimizationRemarkEmitter &ORE =
906       getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
907   TargetTransformInfo *TTI =
908       &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
909 
910   bool DepsChanged = false;
911   runIslScheduleOptimizer(S, getDependences, TTI, &ORE, LastSchedule,
912                           DepsChanged);
913   if (DepsChanged)
914     getAnalysis<DependenceInfo>().abandonDependences();
915   return false;
916 }
917 
runScheduleOptimizerPrinter(raw_ostream & OS,isl::schedule LastSchedule)918 static void runScheduleOptimizerPrinter(raw_ostream &OS,
919                                         isl::schedule LastSchedule) {
920   isl_printer *p;
921   char *ScheduleStr;
922 
923   OS << "Calculated schedule:\n";
924 
925   if (LastSchedule.is_null()) {
926     OS << "n/a\n";
927     return;
928   }
929 
930   p = isl_printer_to_str(LastSchedule.ctx().get());
931   p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK);
932   p = isl_printer_print_schedule(p, LastSchedule.get());
933   ScheduleStr = isl_printer_get_str(p);
934   isl_printer_free(p);
935 
936   OS << ScheduleStr << "\n";
937 
938   free(ScheduleStr);
939 }
940 
printScop(raw_ostream & OS,Scop &) const941 void IslScheduleOptimizerWrapperPass::printScop(raw_ostream &OS, Scop &) const {
942   runScheduleOptimizerPrinter(OS, LastSchedule);
943 }
944 
getAnalysisUsage(AnalysisUsage & AU) const945 void IslScheduleOptimizerWrapperPass::getAnalysisUsage(
946     AnalysisUsage &AU) const {
947   ScopPass::getAnalysisUsage(AU);
948   AU.addRequired<DependenceInfo>();
949   AU.addRequired<TargetTransformInfoWrapperPass>();
950   AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
951 
952   AU.addPreserved<DependenceInfo>();
953   AU.addPreserved<OptimizationRemarkEmitterWrapperPass>();
954 }
955 
956 } // namespace
957 
createIslScheduleOptimizerWrapperPass()958 Pass *polly::createIslScheduleOptimizerWrapperPass() {
959   return new IslScheduleOptimizerWrapperPass();
960 }
961 
962 INITIALIZE_PASS_BEGIN(IslScheduleOptimizerWrapperPass, "polly-opt-isl",
963                       "Polly - Optimize schedule of SCoP", false, false);
964 INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
965 INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass);
966 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass);
967 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass);
968 INITIALIZE_PASS_END(IslScheduleOptimizerWrapperPass, "polly-opt-isl",
969                     "Polly - Optimize schedule of SCoP", false, false)
970 
971 static llvm::PreservedAnalyses
runIslScheduleOptimizerUsingNPM(Scop & S,ScopAnalysisManager & SAM,ScopStandardAnalysisResults & SAR,SPMUpdater & U,raw_ostream * OS)972 runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM,
973                                 ScopStandardAnalysisResults &SAR, SPMUpdater &U,
974                                 raw_ostream *OS) {
975   DependenceAnalysis::Result &Deps = SAM.getResult<DependenceAnalysis>(S, SAR);
976   auto GetDeps = [&Deps](Dependences::AnalysisLevel) -> const Dependences & {
977     return Deps.getDependences(Dependences::AL_Statement);
978   };
979   OptimizationRemarkEmitter ORE(&S.getFunction());
980   TargetTransformInfo *TTI = &SAR.TTI;
981   isl::schedule LastSchedule;
982   bool DepsChanged = false;
983   runIslScheduleOptimizer(S, GetDeps, TTI, &ORE, LastSchedule, DepsChanged);
984   if (DepsChanged)
985     Deps.abandonDependences();
986 
987   if (OS) {
988     *OS << "Printing analysis 'Polly - Optimize schedule of SCoP' for region: '"
989         << S.getName() << "' in function '" << S.getFunction().getName()
990         << "':\n";
991     runScheduleOptimizerPrinter(*OS, LastSchedule);
992   }
993   return PreservedAnalyses::all();
994 }
995 
996 llvm::PreservedAnalyses
run(Scop & S,ScopAnalysisManager & SAM,ScopStandardAnalysisResults & SAR,SPMUpdater & U)997 IslScheduleOptimizerPass::run(Scop &S, ScopAnalysisManager &SAM,
998                               ScopStandardAnalysisResults &SAR, SPMUpdater &U) {
999   return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, nullptr);
1000 }
1001 
1002 llvm::PreservedAnalyses
run(Scop & S,ScopAnalysisManager & SAM,ScopStandardAnalysisResults & SAR,SPMUpdater & U)1003 IslScheduleOptimizerPrinterPass::run(Scop &S, ScopAnalysisManager &SAM,
1004                                      ScopStandardAnalysisResults &SAR,
1005                                      SPMUpdater &U) {
1006   return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, &OS);
1007 }
1008 
1009 //===----------------------------------------------------------------------===//
1010 
1011 namespace {
1012 /// Print result from IslScheduleOptimizerWrapperPass.
1013 class IslScheduleOptimizerPrinterLegacyPass final : public ScopPass {
1014 public:
1015   static char ID;
1016 
IslScheduleOptimizerPrinterLegacyPass()1017   IslScheduleOptimizerPrinterLegacyPass()
1018       : IslScheduleOptimizerPrinterLegacyPass(outs()) {}
IslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream & OS)1019   explicit IslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS)
1020       : ScopPass(ID), OS(OS) {}
1021 
runOnScop(Scop & S)1022   bool runOnScop(Scop &S) override {
1023     IslScheduleOptimizerWrapperPass &P =
1024         getAnalysis<IslScheduleOptimizerWrapperPass>();
1025 
1026     OS << "Printing analysis '" << P.getPassName() << "' for region: '"
1027        << S.getRegion().getNameStr() << "' in function '"
1028        << S.getFunction().getName() << "':\n";
1029     P.printScop(OS, S);
1030 
1031     return false;
1032   }
1033 
getAnalysisUsage(AnalysisUsage & AU) const1034   void getAnalysisUsage(AnalysisUsage &AU) const override {
1035     ScopPass::getAnalysisUsage(AU);
1036     AU.addRequired<IslScheduleOptimizerWrapperPass>();
1037     AU.setPreservesAll();
1038   }
1039 
1040 private:
1041   llvm::raw_ostream &OS;
1042 };
1043 
1044 char IslScheduleOptimizerPrinterLegacyPass::ID = 0;
1045 } // namespace
1046 
createIslScheduleOptimizerPrinterLegacyPass(raw_ostream & OS)1047 Pass *polly::createIslScheduleOptimizerPrinterLegacyPass(raw_ostream &OS) {
1048   return new IslScheduleOptimizerPrinterLegacyPass(OS);
1049 }
1050 
1051 INITIALIZE_PASS_BEGIN(IslScheduleOptimizerPrinterLegacyPass,
1052                       "polly-print-opt-isl",
1053                       "Polly - Print optimizer schedule of SCoP", false, false);
1054 INITIALIZE_PASS_DEPENDENCY(IslScheduleOptimizerWrapperPass)
1055 INITIALIZE_PASS_END(IslScheduleOptimizerPrinterLegacyPass,
1056                     "polly-print-opt-isl",
1057                     "Polly - Print optimizer schedule of SCoP", false, false)
1058