1 //===- Schedule.cpp - Calculate an optimized schedule ---------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass the isl to calculate a schedule that is optimized for parallelism
11 // and tileablility. The algorithm used in isl is an optimized version of the
12 // algorithm described in following paper:
13 //
14 // U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
15 // A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
16 // In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
17 // Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008.
18 //===----------------------------------------------------------------------===//
19 
20 #include "polly/ScheduleOptimizer.h"
21 #include "polly/CodeGen/CodeGeneration.h"
22 #include "polly/DependenceInfo.h"
23 #include "polly/LinkAllPasses.h"
24 #include "polly/Options.h"
25 #include "polly/ScopInfo.h"
26 #include "polly/Support/GICHelper.h"
27 #include "llvm/Support/Debug.h"
28 #include "isl/aff.h"
29 #include "isl/band.h"
30 #include "isl/constraint.h"
31 #include "isl/map.h"
32 #include "isl/options.h"
33 #include "isl/printer.h"
34 #include "isl/schedule.h"
35 #include "isl/schedule_node.h"
36 #include "isl/space.h"
37 #include "isl/union_map.h"
38 #include "isl/union_set.h"
39 
40 using namespace llvm;
41 using namespace polly;
42 
43 #define DEBUG_TYPE "polly-opt-isl"
44 
45 namespace polly {
46 bool DisablePollyTiling;
47 }
48 static cl::opt<bool, true>
49     DisableTiling("polly-no-tiling",
50                   cl::desc("Disable tiling in the scheduler"),
51                   cl::location(polly::DisablePollyTiling), cl::init(false),
52                   cl::ZeroOrMore, cl::cat(PollyCategory));
53 
54 static cl::opt<std::string>
55     OptimizeDeps("polly-opt-optimize-only",
56                  cl::desc("Only a certain kind of dependences (all/raw)"),
57                  cl::Hidden, cl::init("all"), cl::ZeroOrMore,
58                  cl::cat(PollyCategory));
59 
60 static cl::opt<std::string>
61     SimplifyDeps("polly-opt-simplify-deps",
62                  cl::desc("Dependences should be simplified (yes/no)"),
63                  cl::Hidden, cl::init("yes"), cl::ZeroOrMore,
64                  cl::cat(PollyCategory));
65 
66 static cl::opt<int> MaxConstantTerm(
67     "polly-opt-max-constant-term",
68     cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden,
69     cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));
70 
71 static cl::opt<int> MaxCoefficient(
72     "polly-opt-max-coefficient",
73     cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden,
74     cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));
75 
76 static cl::opt<std::string> FusionStrategy(
77     "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"),
78     cl::Hidden, cl::init("min"), cl::ZeroOrMore, cl::cat(PollyCategory));
79 
80 static cl::opt<std::string>
81     MaximizeBandDepth("polly-opt-maximize-bands",
82                       cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
83                       cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory));
84 
85 static cl::opt<int> DefaultTileSize(
86     "polly-default-tile-size",
87     cl::desc("The default tile size (if not enough were provided by"
88              " --polly-tile-sizes)"),
89     cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory));
90 
91 static cl::list<int> TileSizes("polly-tile-sizes",
92                                cl::desc("A tile size"
93                                         " for each loop dimension, filled with"
94                                         " --polly-default-tile-size"),
95                                cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
96                                cl::cat(PollyCategory));
97 namespace {
98 
99 class IslScheduleOptimizer : public ScopPass {
100 public:
101   static char ID;
102   explicit IslScheduleOptimizer() : ScopPass(ID) { LastSchedule = nullptr; }
103 
104   ~IslScheduleOptimizer() { isl_schedule_free(LastSchedule); }
105 
106   bool runOnScop(Scop &S) override;
107   void printScop(raw_ostream &OS, Scop &S) const override;
108   void getAnalysisUsage(AnalysisUsage &AU) const override;
109 
110 private:
111   isl_schedule *LastSchedule;
112 
113   /// @brief Decide if the @p NewSchedule is profitable for @p S.
114   ///
115   /// @param S           The SCoP we optimize.
116   /// @param NewSchedule The new schedule we computed.
117   ///
118   /// @return True, if we believe @p NewSchedule is an improvement for @p S.
119   bool isProfitableSchedule(Scop &S, __isl_keep isl_union_map *NewSchedule);
120 
121   /// @brief Create a map that pre-vectorizes one scheduling dimension.
122   ///
123   /// getPrevectorMap creates a map that maps each input dimension to the same
124   /// output dimension, except for the dimension DimToVectorize.
125   /// DimToVectorize is strip mined by 'VectorWidth' and the newly created
126   /// point loop of DimToVectorize is moved to the innermost level.
127   ///
128   /// Example (DimToVectorize=0, ScheduleDimensions=2, VectorWidth=4):
129   ///
130   /// | Before transformation
131   /// |
132   /// | A[i,j] -> [i,j]
133   /// |
134   /// | for (i = 0; i < 128; i++)
135   /// |    for (j = 0; j < 128; j++)
136   /// |      A(i,j);
137   ///
138   ///   Prevector map:
139   ///   [i,j] -> [it,j,ip] : it % 4 = 0 and it <= ip <= it + 3 and i = ip
140   ///
141   /// | After transformation:
142   /// |
143   /// | A[i,j] -> [it,j,ip] : it % 4 = 0 and it <= ip <= it + 3 and i = ip
144   /// |
145   /// | for (it = 0; it < 128; it+=4)
146   /// |    for (j = 0; j < 128; j++)
147   /// |      for (ip = max(0,it); ip < min(128, it + 3); ip++)
148   /// |        A(ip,j);
149   ///
150   /// The goal of this transformation is to create a trivially vectorizable
151   /// loop.  This means a parallel loop at the innermost level that has a
152   /// constant number of iterations corresponding to the target vector width.
153   ///
154   /// This transformation creates a loop at the innermost level. The loop has
155   /// a constant number of iterations, if the number of loop iterations at
156   /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is
157   /// currently constant and not yet target specific. This function does not
158   /// reason about parallelism.
159   static __isl_give isl_map *getPrevectorMap(isl_ctx *ctx, int DimToVectorize,
160                                              int ScheduleDimensions,
161                                              int VectorWidth = 4);
162 
163   /// @brief Apply additional optimizations on the bands in the schedule tree.
164   ///
165   /// We are looking for an innermost band node and apply the following
166   /// transformations:
167   ///
168   ///  - Tile the band
169   ///      - if the band is tileable
170   ///      - if the band has more than one loop dimension
171   ///
172   ///  - Prevectorize the point loop of the tile
173   ///      - if vectorization is enabled
174   ///
175   /// @param Node The schedule node to (possibly) optimize.
176   /// @param User A pointer to forward some use information (currently unused).
177   static isl_schedule_node *optimizeBand(isl_schedule_node *Node, void *User);
178 
179   static __isl_give isl_union_map *
180   getScheduleMap(__isl_keep isl_schedule *Schedule);
181 
182   using llvm::Pass::doFinalization;
183 
184   virtual bool doFinalization() override {
185     isl_schedule_free(LastSchedule);
186     LastSchedule = nullptr;
187     return true;
188   }
189 };
190 }
191 
192 char IslScheduleOptimizer::ID = 0;
193 
194 __isl_give isl_map *
195 IslScheduleOptimizer::getPrevectorMap(isl_ctx *ctx, int DimToVectorize,
196                                       int ScheduleDimensions, int VectorWidth) {
197   isl_space *Space;
198   isl_local_space *LocalSpace, *LocalSpaceRange;
199   isl_set *Modulo;
200   isl_map *TilingMap;
201   isl_constraint *c;
202   isl_aff *Aff;
203   int PointDimension; /* ip */
204   int TileDimension;  /* it */
205   isl_val *VectorWidthMP;
206 
207   assert(0 <= DimToVectorize && DimToVectorize < ScheduleDimensions);
208 
209   Space = isl_space_alloc(ctx, 0, ScheduleDimensions, ScheduleDimensions + 1);
210   TilingMap = isl_map_universe(isl_space_copy(Space));
211   LocalSpace = isl_local_space_from_space(Space);
212   PointDimension = ScheduleDimensions;
213   TileDimension = DimToVectorize;
214 
215   // Create an identity map for everything except DimToVectorize and map
216   // DimToVectorize to the point loop at the innermost dimension.
217   for (int i = 0; i < ScheduleDimensions; i++)
218     if (i == DimToVectorize)
219       TilingMap =
220           isl_map_equate(TilingMap, isl_dim_in, i, isl_dim_out, PointDimension);
221     else
222       TilingMap = isl_map_equate(TilingMap, isl_dim_in, i, isl_dim_out, i);
223 
224   // it % 'VectorWidth' = 0
225   LocalSpaceRange = isl_local_space_range(isl_local_space_copy(LocalSpace));
226   Aff = isl_aff_zero_on_domain(LocalSpaceRange);
227   Aff = isl_aff_set_constant_si(Aff, VectorWidth);
228   Aff = isl_aff_set_coefficient_si(Aff, isl_dim_in, TileDimension, 1);
229   VectorWidthMP = isl_val_int_from_si(ctx, VectorWidth);
230   Aff = isl_aff_mod_val(Aff, VectorWidthMP);
231   Modulo = isl_pw_aff_zero_set(isl_pw_aff_from_aff(Aff));
232   TilingMap = isl_map_intersect_range(TilingMap, Modulo);
233 
234   // it <= ip
235   TilingMap = isl_map_order_le(TilingMap, isl_dim_out, TileDimension,
236                                isl_dim_out, PointDimension);
237 
238   // ip <= it + ('VectorWidth' - 1)
239   c = isl_inequality_alloc(LocalSpace);
240   isl_constraint_set_coefficient_si(c, isl_dim_out, TileDimension, 1);
241   isl_constraint_set_coefficient_si(c, isl_dim_out, PointDimension, -1);
242   isl_constraint_set_constant_si(c, VectorWidth - 1);
243   TilingMap = isl_map_add_constraint(TilingMap, c);
244 
245   return TilingMap;
246 }
247 
248 isl_schedule_node *IslScheduleOptimizer::optimizeBand(isl_schedule_node *Node,
249                                                       void *User) {
250   if (isl_schedule_node_get_type(Node) != isl_schedule_node_band)
251     return Node;
252 
253   if (isl_schedule_node_n_children(Node) != 1)
254     return Node;
255 
256   if (!isl_schedule_node_band_get_permutable(Node))
257     return Node;
258 
259   auto Space = isl_schedule_node_band_get_space(Node);
260   auto Dims = isl_space_dim(Space, isl_dim_set);
261 
262   if (Dims <= 1) {
263     isl_space_free(Space);
264     return Node;
265   }
266 
267   auto Child = isl_schedule_node_get_child(Node, 0);
268   auto Type = isl_schedule_node_get_type(Child);
269   isl_schedule_node_free(Child);
270 
271   if (Type != isl_schedule_node_leaf) {
272     isl_space_free(Space);
273     return Node;
274   }
275 
276   auto Sizes = isl_multi_val_zero(Space);
277   auto Ctx = isl_schedule_node_get_ctx(Node);
278 
279   for (unsigned i = 0; i < Dims; i++) {
280     auto tileSize = TileSizes.size() > i ? TileSizes[i] : DefaultTileSize;
281     Sizes = isl_multi_val_set_val(Sizes, i, isl_val_int_from_si(Ctx, tileSize));
282   }
283 
284   isl_schedule_node *Res;
285 
286   if (DisableTiling) {
287     isl_multi_val_free(Sizes);
288     Res = Node;
289   } else {
290     Res = isl_schedule_node_band_tile(Node, Sizes);
291   }
292 
293   if (PollyVectorizerChoice == VECTORIZER_NONE)
294     return Res;
295 
296   Child = isl_schedule_node_get_child(Res, 0);
297   auto ChildSchedule = isl_schedule_node_band_get_partial_schedule(Child);
298 
299   for (int i = Dims - 1; i >= 0; i--) {
300     if (isl_schedule_node_band_member_get_coincident(Child, i)) {
301       auto TileMap = IslScheduleOptimizer::getPrevectorMap(Ctx, i, Dims);
302       auto TileUMap = isl_union_map_from_map(TileMap);
303       auto ChildSchedule2 = isl_union_map_apply_range(
304           isl_union_map_from_multi_union_pw_aff(ChildSchedule), TileUMap);
305       ChildSchedule = isl_multi_union_pw_aff_from_union_map(ChildSchedule2);
306       break;
307     }
308   }
309 
310   isl_schedule_node_free(Res);
311   Res = isl_schedule_node_delete(Child);
312   Res = isl_schedule_node_insert_partial_schedule(Res, ChildSchedule);
313   return Res;
314 }
315 
316 __isl_give isl_union_map *
317 IslScheduleOptimizer::getScheduleMap(__isl_keep isl_schedule *Schedule) {
318   isl_schedule_node *Root = isl_schedule_get_root(Schedule);
319   Root = isl_schedule_node_map_descendant_bottom_up(
320       Root, IslScheduleOptimizer::optimizeBand, NULL);
321   auto ScheduleMap = isl_schedule_node_get_subtree_schedule_union_map(Root);
322   ScheduleMap = isl_union_map_detect_equalities(ScheduleMap);
323   isl_schedule_node_free(Root);
324   return ScheduleMap;
325 }
326 
327 bool IslScheduleOptimizer::isProfitableSchedule(
328     Scop &S, __isl_keep isl_union_map *NewSchedule) {
329   // To understand if the schedule has been optimized we check if the schedule
330   // has changed at all.
331   // TODO: We can improve this by tracking if any necessarily beneficial
332   // transformations have been performed. This can e.g. be tiling, loop
333   // interchange, or ...) We can track this either at the place where the
334   // transformation has been performed or, in case of automatic ILP based
335   // optimizations, by comparing (yet to be defined) performance metrics
336   // before/after the scheduling optimizer
337   // (e.g., #stride-one accesses)
338   isl_union_map *OldSchedule = S.getSchedule();
339   bool changed = !isl_union_map_is_equal(OldSchedule, NewSchedule);
340   isl_union_map_free(OldSchedule);
341   return changed;
342 }
343 
344 bool IslScheduleOptimizer::runOnScop(Scop &S) {
345 
346   // Skip empty SCoPs but still allow code generation as it will delete the
347   // loops present but not needed.
348   if (S.getSize() == 0) {
349     S.markAsOptimized();
350     return false;
351   }
352 
353   const Dependences &D = getAnalysis<DependenceInfo>().getDependences();
354 
355   if (!D.hasValidDependences())
356     return false;
357 
358   isl_schedule_free(LastSchedule);
359   LastSchedule = nullptr;
360 
361   // Build input data.
362   int ValidityKinds =
363       Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
364   int ProximityKinds;
365 
366   if (OptimizeDeps == "all")
367     ProximityKinds =
368         Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
369   else if (OptimizeDeps == "raw")
370     ProximityKinds = Dependences::TYPE_RAW;
371   else {
372     errs() << "Do not know how to optimize for '" << OptimizeDeps << "'"
373            << " Falling back to optimizing all dependences.\n";
374     ProximityKinds =
375         Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
376   }
377 
378   isl_union_set *Domain = S.getDomains();
379 
380   if (!Domain)
381     return false;
382 
383   isl_union_map *Validity = D.getDependences(ValidityKinds);
384   isl_union_map *Proximity = D.getDependences(ProximityKinds);
385 
386   // Simplify the dependences by removing the constraints introduced by the
387   // domains. This can speed up the scheduling time significantly, as large
388   // constant coefficients will be removed from the dependences. The
389   // introduction of some additional dependences reduces the possible
390   // transformations, but in most cases, such transformation do not seem to be
391   // interesting anyway. In some cases this option may stop the scheduler to
392   // find any schedule.
393   if (SimplifyDeps == "yes") {
394     Validity = isl_union_map_gist_domain(Validity, isl_union_set_copy(Domain));
395     Validity = isl_union_map_gist_range(Validity, isl_union_set_copy(Domain));
396     Proximity =
397         isl_union_map_gist_domain(Proximity, isl_union_set_copy(Domain));
398     Proximity = isl_union_map_gist_range(Proximity, isl_union_set_copy(Domain));
399   } else if (SimplifyDeps != "no") {
400     errs() << "warning: Option -polly-opt-simplify-deps should either be 'yes' "
401               "or 'no'. Falling back to default: 'yes'\n";
402   }
403 
404   DEBUG(dbgs() << "\n\nCompute schedule from: ");
405   DEBUG(dbgs() << "Domain := " << stringFromIslObj(Domain) << ";\n");
406   DEBUG(dbgs() << "Proximity := " << stringFromIslObj(Proximity) << ";\n");
407   DEBUG(dbgs() << "Validity := " << stringFromIslObj(Validity) << ";\n");
408 
409   unsigned IslSerializeSCCs;
410 
411   if (FusionStrategy == "max") {
412     IslSerializeSCCs = 0;
413   } else if (FusionStrategy == "min") {
414     IslSerializeSCCs = 1;
415   } else {
416     errs() << "warning: Unknown fusion strategy. Falling back to maximal "
417               "fusion.\n";
418     IslSerializeSCCs = 0;
419   }
420 
421   int IslMaximizeBands;
422 
423   if (MaximizeBandDepth == "yes") {
424     IslMaximizeBands = 1;
425   } else if (MaximizeBandDepth == "no") {
426     IslMaximizeBands = 0;
427   } else {
428     errs() << "warning: Option -polly-opt-maximize-bands should either be 'yes'"
429               " or 'no'. Falling back to default: 'yes'\n";
430     IslMaximizeBands = 1;
431   }
432 
433   isl_options_set_schedule_serialize_sccs(S.getIslCtx(), IslSerializeSCCs);
434   isl_options_set_schedule_maximize_band_depth(S.getIslCtx(), IslMaximizeBands);
435   isl_options_set_schedule_max_constant_term(S.getIslCtx(), MaxConstantTerm);
436   isl_options_set_schedule_max_coefficient(S.getIslCtx(), MaxCoefficient);
437   isl_options_set_tile_scale_tile_loops(S.getIslCtx(), 0);
438 
439   isl_options_set_on_error(S.getIslCtx(), ISL_ON_ERROR_CONTINUE);
440 
441   isl_schedule_constraints *ScheduleConstraints;
442   ScheduleConstraints = isl_schedule_constraints_on_domain(Domain);
443   ScheduleConstraints =
444       isl_schedule_constraints_set_proximity(ScheduleConstraints, Proximity);
445   ScheduleConstraints = isl_schedule_constraints_set_validity(
446       ScheduleConstraints, isl_union_map_copy(Validity));
447   ScheduleConstraints =
448       isl_schedule_constraints_set_coincidence(ScheduleConstraints, Validity);
449   isl_schedule *Schedule;
450   Schedule = isl_schedule_constraints_compute_schedule(ScheduleConstraints);
451   isl_options_set_on_error(S.getIslCtx(), ISL_ON_ERROR_ABORT);
452 
453   // In cases the scheduler is not able to optimize the code, we just do not
454   // touch the schedule.
455   if (!Schedule)
456     return false;
457 
458   DEBUG({
459     auto *P = isl_printer_to_str(S.getIslCtx());
460     P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
461     P = isl_printer_print_schedule(P, Schedule);
462     dbgs() << "NewScheduleTree: \n" << isl_printer_get_str(P) << "\n";
463     isl_printer_free(P);
464   });
465 
466   isl_union_map *NewSchedule = getScheduleMap(Schedule);
467 
468   // Check if the optimizations performed were profitable, otherwise exit early.
469   if (!isProfitableSchedule(S, NewSchedule)) {
470     isl_schedule_free(Schedule);
471     isl_union_map_free(NewSchedule);
472     return false;
473   }
474 
475   S.markAsOptimized();
476 
477   for (ScopStmt &Stmt : S) {
478     isl_map *StmtSchedule;
479     isl_set *Domain = Stmt.getDomain();
480     isl_union_map *StmtBand;
481     StmtBand = isl_union_map_intersect_domain(isl_union_map_copy(NewSchedule),
482                                               isl_union_set_from_set(Domain));
483     if (isl_union_map_is_empty(StmtBand)) {
484       StmtSchedule = isl_map_from_domain(isl_set_empty(Stmt.getDomainSpace()));
485       isl_union_map_free(StmtBand);
486     } else {
487       assert(isl_union_map_n_map(StmtBand) == 1);
488       StmtSchedule = isl_map_from_union_map(StmtBand);
489     }
490 
491     Stmt.setSchedule(StmtSchedule);
492   }
493 
494   isl_schedule_free(Schedule);
495   isl_union_map_free(NewSchedule);
496   return false;
497 }
498 
499 void IslScheduleOptimizer::printScop(raw_ostream &OS, Scop &) const {
500   isl_printer *p;
501   char *ScheduleStr;
502 
503   OS << "Calculated schedule:\n";
504 
505   if (!LastSchedule) {
506     OS << "n/a\n";
507     return;
508   }
509 
510   p = isl_printer_to_str(isl_schedule_get_ctx(LastSchedule));
511   p = isl_printer_print_schedule(p, LastSchedule);
512   ScheduleStr = isl_printer_get_str(p);
513   isl_printer_free(p);
514 
515   OS << ScheduleStr << "\n";
516 }
517 
518 void IslScheduleOptimizer::getAnalysisUsage(AnalysisUsage &AU) const {
519   ScopPass::getAnalysisUsage(AU);
520   AU.addRequired<DependenceInfo>();
521 }
522 
523 Pass *polly::createIslScheduleOptimizerPass() {
524   return new IslScheduleOptimizer();
525 }
526 
527 INITIALIZE_PASS_BEGIN(IslScheduleOptimizer, "polly-opt-isl",
528                       "Polly - Optimize schedule of SCoP", false, false);
529 INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
530 INITIALIZE_PASS_DEPENDENCY(ScopInfo);
531 INITIALIZE_PASS_END(IslScheduleOptimizer, "polly-opt-isl",
532                     "Polly - Optimize schedule of SCoP", false, false)
533