1 //===- ScheduleOptimizer.cpp - Calculate an optimized schedule ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass generates an entirely new schedule tree from the data dependences
10 // and iteration domains. The new schedule tree is computed in two steps:
11 //
12 // 1) The isl scheduling optimizer is run
13 //
14 // The isl scheduling optimizer creates a new schedule tree that maximizes
15 // parallelism and tileability and minimizes data-dependence distances. The
16 // algorithm used is a modified version of the ``Pluto'' algorithm:
17 //
18 // U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
19 // A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
20 // In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
21 // Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008.
22 //
23 // 2) A set of post-scheduling transformations is applied on the schedule tree.
24 //
25 // These optimizations include:
26 //
27 // - Tiling of the innermost tilable bands
28 // - Prevectorization - The choice of a possible outer loop that is strip-mined
29 // to the innermost level to enable inner-loop
30 // vectorization.
31 // - Some optimizations for spatial locality are also planned.
32 //
33 // For a detailed description of the schedule tree itself please see section 6
34 // of:
35 //
36 // Polyhedral AST generation is more than scanning polyhedra
37 // Tobias Grosser, Sven Verdoolaege, Albert Cohen
38 // ACM Transactions on Programming Languages and Systems (TOPLAS),
39 // 37(4), July 2015
40 // http://www.grosser.es/#pub-polyhedral-AST-generation
41 //
42 // This publication also contains a detailed discussion of the different options
43 // for polyhedral loop unrolling, full/partial tile separation and other uses
44 // of the schedule tree.
45 //
46 //===----------------------------------------------------------------------===//
47
48 #include "polly/ScheduleOptimizer.h"
49 #include "polly/CodeGen/CodeGeneration.h"
50 #include "polly/DependenceInfo.h"
51 #include "polly/ManualOptimizer.h"
52 #include "polly/MatmulOptimizer.h"
53 #include "polly/Options.h"
54 #include "polly/ScheduleTreeTransform.h"
55 #include "polly/Support/ISLOStream.h"
56 #include "polly/Support/ISLTools.h"
57 #include "llvm/ADT/Sequence.h"
58 #include "llvm/ADT/Statistic.h"
59 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
60 #include "llvm/InitializePasses.h"
61 #include "llvm/Support/CommandLine.h"
62 #include "isl/options.h"
63
64 using namespace llvm;
65 using namespace polly;
66
67 namespace llvm {
68 class Loop;
69 class Module;
70 } // namespace llvm
71
72 #define DEBUG_TYPE "polly-opt-isl"
73
74 static cl::opt<std::string>
75 OptimizeDeps("polly-opt-optimize-only",
76 cl::desc("Only a certain kind of dependences (all/raw)"),
77 cl::Hidden, cl::init("all"), cl::cat(PollyCategory));
78
79 static cl::opt<std::string>
80 SimplifyDeps("polly-opt-simplify-deps",
81 cl::desc("Dependences should be simplified (yes/no)"),
82 cl::Hidden, cl::init("yes"), cl::cat(PollyCategory));
83
84 static cl::opt<int> MaxConstantTerm(
85 "polly-opt-max-constant-term",
86 cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden,
87 cl::init(20), cl::cat(PollyCategory));
88
89 static cl::opt<int> MaxCoefficient(
90 "polly-opt-max-coefficient",
91 cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden,
92 cl::init(20), cl::cat(PollyCategory));
93
94 static cl::opt<std::string>
95 MaximizeBandDepth("polly-opt-maximize-bands",
96 cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
97 cl::init("yes"), cl::cat(PollyCategory));
98
99 static cl::opt<bool>
100 GreedyFusion("polly-loopfusion-greedy",
101 cl::desc("Aggressively try to fuse everything"), cl::Hidden,
102 cl::cat(PollyCategory));
103
104 static cl::opt<std::string> OuterCoincidence(
105 "polly-opt-outer-coincidence",
106 cl::desc("Try to construct schedules where the outer member of each band "
107 "satisfies the coincidence constraints (yes/no)"),
108 cl::Hidden, cl::init("no"), cl::cat(PollyCategory));
109
110 static cl::opt<int> PrevectorWidth(
111 "polly-prevect-width",
112 cl::desc(
113 "The number of loop iterations to strip-mine for pre-vectorization"),
114 cl::Hidden, cl::init(4), cl::cat(PollyCategory));
115
116 static cl::opt<bool> FirstLevelTiling("polly-tiling",
117 cl::desc("Enable loop tiling"),
118 cl::init(true), cl::cat(PollyCategory));
119
120 static cl::opt<int> FirstLevelDefaultTileSize(
121 "polly-default-tile-size",
122 cl::desc("The default tile size (if not enough were provided by"
123 " --polly-tile-sizes)"),
124 cl::Hidden, cl::init(32), cl::cat(PollyCategory));
125
126 static cl::list<int>
127 FirstLevelTileSizes("polly-tile-sizes",
128 cl::desc("A tile size for each loop dimension, filled "
129 "with --polly-default-tile-size"),
130 cl::Hidden, cl::CommaSeparated, cl::cat(PollyCategory));
131
132 static cl::opt<bool>
133 SecondLevelTiling("polly-2nd-level-tiling",
134 cl::desc("Enable a 2nd level loop of loop tiling"),
135 cl::cat(PollyCategory));
136
137 static cl::opt<int> SecondLevelDefaultTileSize(
138 "polly-2nd-level-default-tile-size",
139 cl::desc("The default 2nd-level tile size (if not enough were provided by"
140 " --polly-2nd-level-tile-sizes)"),
141 cl::Hidden, cl::init(16), cl::cat(PollyCategory));
142
143 static cl::list<int>
144 SecondLevelTileSizes("polly-2nd-level-tile-sizes",
145 cl::desc("A tile size for each loop dimension, filled "
146 "with --polly-default-tile-size"),
147 cl::Hidden, cl::CommaSeparated,
148 cl::cat(PollyCategory));
149
150 static cl::opt<bool> RegisterTiling("polly-register-tiling",
151 cl::desc("Enable register tiling"),
152 cl::cat(PollyCategory));
153
154 static cl::opt<int> RegisterDefaultTileSize(
155 "polly-register-tiling-default-tile-size",
156 cl::desc("The default register tile size (if not enough were provided by"
157 " --polly-register-tile-sizes)"),
158 cl::Hidden, cl::init(2), cl::cat(PollyCategory));
159
160 static cl::list<int>
161 RegisterTileSizes("polly-register-tile-sizes",
162 cl::desc("A tile size for each loop dimension, filled "
163 "with --polly-register-tile-size"),
164 cl::Hidden, cl::CommaSeparated, cl::cat(PollyCategory));
165
166 static cl::opt<bool> PragmaBasedOpts(
167 "polly-pragma-based-opts",
168 cl::desc("Apply user-directed transformation from metadata"),
169 cl::init(true), cl::cat(PollyCategory));
170
171 static cl::opt<bool> EnableReschedule("polly-reschedule",
172 cl::desc("Optimize SCoPs using ISL"),
173 cl::init(true), cl::cat(PollyCategory));
174
175 static cl::opt<bool>
176 PMBasedOpts("polly-pattern-matching-based-opts",
177 cl::desc("Perform optimizations based on pattern matching"),
178 cl::init(true), cl::cat(PollyCategory));
179
180 static cl::opt<bool>
181 EnablePostopts("polly-postopts",
182 cl::desc("Apply post-rescheduling optimizations such as "
183 "tiling (requires -polly-reschedule)"),
184 cl::init(true), cl::cat(PollyCategory));
185
186 static cl::opt<bool> OptimizedScops(
187 "polly-optimized-scops",
188 cl::desc("Polly - Dump polyhedral description of Scops optimized with "
189 "the isl scheduling optimizer and the set of post-scheduling "
190 "transformations is applied on the schedule tree"),
191 cl::cat(PollyCategory));
192
193 STATISTIC(ScopsProcessed, "Number of scops processed");
194 STATISTIC(ScopsRescheduled, "Number of scops rescheduled");
195 STATISTIC(ScopsOptimized, "Number of scops optimized");
196
197 STATISTIC(NumAffineLoopsOptimized, "Number of affine loops optimized");
198 STATISTIC(NumBoxedLoopsOptimized, "Number of boxed loops optimized");
199
200 #define THREE_STATISTICS(VARNAME, DESC) \
201 static Statistic VARNAME[3] = { \
202 {DEBUG_TYPE, #VARNAME "0", DESC " (original)"}, \
203 {DEBUG_TYPE, #VARNAME "1", DESC " (after scheduler)"}, \
204 {DEBUG_TYPE, #VARNAME "2", DESC " (after optimizer)"}}
205
206 THREE_STATISTICS(NumBands, "Number of bands");
207 THREE_STATISTICS(NumBandMembers, "Number of band members");
208 THREE_STATISTICS(NumCoincident, "Number of coincident band members");
209 THREE_STATISTICS(NumPermutable, "Number of permutable bands");
210 THREE_STATISTICS(NumFilters, "Number of filter nodes");
211 THREE_STATISTICS(NumExtension, "Number of extension nodes");
212
213 STATISTIC(FirstLevelTileOpts, "Number of first level tiling applied");
214 STATISTIC(SecondLevelTileOpts, "Number of second level tiling applied");
215 STATISTIC(RegisterTileOpts, "Number of register tiling applied");
216 STATISTIC(PrevectOpts, "Number of strip-mining for prevectorization applied");
217 STATISTIC(MatMulOpts,
218 "Number of matrix multiplication patterns detected and optimized");
219
220 namespace {
221 /// Additional parameters of the schedule optimizer.
222 ///
223 /// Target Transform Info and the SCoP dependencies used by the schedule
224 /// optimizer.
225 struct OptimizerAdditionalInfoTy {
226 const llvm::TargetTransformInfo *TTI;
227 const Dependences *D;
228 bool PatternOpts;
229 bool Postopts;
230 bool Prevect;
231 bool &DepsChanged;
232 };
233
234 class ScheduleTreeOptimizer final {
235 public:
236 /// Apply schedule tree transformations.
237 ///
238 /// This function takes an (possibly already optimized) schedule tree and
239 /// applies a set of additional optimizations on the schedule tree. The
240 /// transformations applied include:
241 ///
242 /// - Pattern-based optimizations
243 /// - Tiling
244 /// - Prevectorization
245 ///
246 /// @param Schedule The schedule object the transformations will be applied
247 /// to.
248 /// @param OAI Target Transform Info and the SCoP dependencies.
249 /// @returns The transformed schedule.
250 static isl::schedule
251 optimizeSchedule(isl::schedule Schedule,
252 const OptimizerAdditionalInfoTy *OAI = nullptr);
253
254 /// Apply schedule tree transformations.
255 ///
256 /// This function takes a node in an (possibly already optimized) schedule
257 /// tree and applies a set of additional optimizations on this schedule tree
258 /// node and its descendants. The transformations applied include:
259 ///
260 /// - Pattern-based optimizations
261 /// - Tiling
262 /// - Prevectorization
263 ///
264 /// @param Node The schedule object post-transformations will be applied to.
265 /// @param OAI Target Transform Info and the SCoP dependencies.
266 /// @returns The transformed schedule.
267 static isl::schedule_node
268 optimizeScheduleNode(isl::schedule_node Node,
269 const OptimizerAdditionalInfoTy *OAI = nullptr);
270
271 /// Decide if the @p NewSchedule is profitable for @p S.
272 ///
273 /// @param S The SCoP we optimize.
274 /// @param NewSchedule The new schedule we computed.
275 ///
276 /// @return True, if we believe @p NewSchedule is an improvement for @p S.
277 static bool isProfitableSchedule(polly::Scop &S, isl::schedule NewSchedule);
278
279 /// Isolate a set of partial tile prefixes.
280 ///
281 /// This set should ensure that it contains only partial tile prefixes that
282 /// have exactly VectorWidth iterations.
283 ///
284 /// @param Node A schedule node band, which is a parent of a band node,
285 /// that contains a vector loop.
286 /// @return Modified isl_schedule_node.
287 static isl::schedule_node isolateFullPartialTiles(isl::schedule_node Node,
288 int VectorWidth);
289
290 private:
291 /// Check if this node is a band node we want to tile.
292 ///
293 /// We look for innermost band nodes where individual dimensions are marked as
294 /// permutable.
295 ///
296 /// @param Node The node to check.
297 static bool isTileableBandNode(isl::schedule_node Node);
298
299 /// Pre-vectorizes one scheduling dimension of a schedule band.
300 ///
301 /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and
302 /// sinks the resulting point loop.
303 ///
304 /// Example (DimToVectorize=0, VectorWidth=4):
305 ///
306 /// | Before transformation:
307 /// |
308 /// | A[i,j] -> [i,j]
309 /// |
310 /// | for (i = 0; i < 128; i++)
311 /// | for (j = 0; j < 128; j++)
312 /// | A(i,j);
313 ///
314 /// | After transformation:
315 /// |
316 /// | for (it = 0; it < 32; it+=1)
317 /// | for (j = 0; j < 128; j++)
318 /// | for (ip = 0; ip <= 3; ip++)
319 /// | A(4 * it + ip,j);
320 ///
321 /// The goal of this transformation is to create a trivially vectorizable
322 /// loop. This means a parallel loop at the innermost level that has a
323 /// constant number of iterations corresponding to the target vector width.
324 ///
325 /// This transformation creates a loop at the innermost level. The loop has
326 /// a constant number of iterations, if the number of loop iterations at
327 /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is
328 /// currently constant and not yet target specific. This function does not
329 /// reason about parallelism.
330 static isl::schedule_node prevectSchedBand(isl::schedule_node Node,
331 unsigned DimToVectorize,
332 int VectorWidth);
333
334 /// Apply additional optimizations on the bands in the schedule tree.
335 ///
336 /// We are looking for an innermost band node and apply the following
337 /// transformations:
338 ///
339 /// - Tile the band
340 /// - if the band is tileable
341 /// - if the band has more than one loop dimension
342 ///
343 /// - Prevectorize the schedule of the band (or the point loop in case of
344 /// tiling).
345 /// - if vectorization is enabled
346 ///
347 /// @param Node The schedule node to (possibly) optimize.
348 /// @param User A pointer to forward some use information
349 /// (currently unused).
350 static isl_schedule_node *optimizeBand(isl_schedule_node *Node, void *User);
351
352 /// Apply tiling optimizations on the bands in the schedule tree.
353 ///
354 /// @param Node The schedule node to (possibly) optimize.
355 static isl::schedule_node applyTileBandOpt(isl::schedule_node Node);
356
357 /// Apply prevectorization on the bands in the schedule tree.
358 ///
359 /// @param Node The schedule node to (possibly) prevectorize.
360 static isl::schedule_node applyPrevectBandOpt(isl::schedule_node Node);
361 };
362
363 isl::schedule_node
isolateFullPartialTiles(isl::schedule_node Node,int VectorWidth)364 ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node,
365 int VectorWidth) {
366 assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
367 Node = Node.child(0).child(0);
368 isl::union_map SchedRelUMap = Node.get_prefix_schedule_relation();
369 isl::union_set ScheduleRangeUSet = SchedRelUMap.range();
370 isl::set ScheduleRange{ScheduleRangeUSet};
371 isl::set IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth);
372 auto AtomicOption = getDimOptions(IsolateDomain.ctx(), "atomic");
373 isl::union_set IsolateOption = getIsolateOptions(IsolateDomain, 1);
374 Node = Node.parent().parent();
375 isl::union_set Options = IsolateOption.unite(AtomicOption);
376 isl::schedule_node_band Result =
377 Node.as<isl::schedule_node_band>().set_ast_build_options(Options);
378 return Result;
379 }
380
381 struct InsertSimdMarkers final : ScheduleNodeRewriter<InsertSimdMarkers> {
visitBand__anonc229ce900111::InsertSimdMarkers382 isl::schedule_node visitBand(isl::schedule_node_band Band) {
383 isl::schedule_node Node = visitChildren(Band);
384
385 // Only add SIMD markers to innermost bands.
386 if (!Node.first_child().isa<isl::schedule_node_leaf>())
387 return Node;
388
389 isl::id LoopMarker = isl::id::alloc(Band.ctx(), "SIMD", nullptr);
390 return Band.insert_mark(LoopMarker);
391 }
392 };
393
prevectSchedBand(isl::schedule_node Node,unsigned DimToVectorize,int VectorWidth)394 isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand(
395 isl::schedule_node Node, unsigned DimToVectorize, int VectorWidth) {
396 assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
397
398 auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
399 unsigned ScheduleDimensions = unsignedFromIslSize(Space.dim(isl::dim::set));
400 assert(DimToVectorize < ScheduleDimensions);
401
402 if (DimToVectorize > 0) {
403 Node = isl::manage(
404 isl_schedule_node_band_split(Node.release(), DimToVectorize));
405 Node = Node.child(0);
406 }
407 if (DimToVectorize < ScheduleDimensions - 1)
408 Node = isl::manage(isl_schedule_node_band_split(Node.release(), 1));
409 Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
410 auto Sizes = isl::multi_val::zero(Space);
411 Sizes = Sizes.set_val(0, isl::val(Node.ctx(), VectorWidth));
412 Node =
413 isl::manage(isl_schedule_node_band_tile(Node.release(), Sizes.release()));
414 Node = isolateFullPartialTiles(Node, VectorWidth);
415 Node = Node.child(0);
416 // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise,
417 // we will have troubles to match it in the backend.
418 Node = Node.as<isl::schedule_node_band>().set_ast_build_options(
419 isl::union_set(Node.ctx(), "{ unroll[x]: 1 = 0 }"));
420
421 // Sink the inner loop into the smallest possible statements to make them
422 // represent a single vector instruction if possible.
423 Node = isl::manage(isl_schedule_node_band_sink(Node.release()));
424
425 // Add SIMD markers to those vector statements.
426 InsertSimdMarkers SimdMarkerInserter;
427 Node = SimdMarkerInserter.visit(Node);
428
429 PrevectOpts++;
430 return Node.parent();
431 }
432
isSimpleInnermostBand(const isl::schedule_node & Node)433 static bool isSimpleInnermostBand(const isl::schedule_node &Node) {
434 assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
435 assert(isl_schedule_node_n_children(Node.get()) == 1);
436
437 auto ChildType = isl_schedule_node_get_type(Node.child(0).get());
438
439 if (ChildType == isl_schedule_node_leaf)
440 return true;
441
442 if (ChildType != isl_schedule_node_sequence)
443 return false;
444
445 auto Sequence = Node.child(0);
446
447 for (int c = 0, nc = isl_schedule_node_n_children(Sequence.get()); c < nc;
448 ++c) {
449 auto Child = Sequence.child(c);
450 if (isl_schedule_node_get_type(Child.get()) != isl_schedule_node_filter)
451 return false;
452 if (isl_schedule_node_get_type(Child.child(0).get()) !=
453 isl_schedule_node_leaf)
454 return false;
455 }
456 return true;
457 }
458
isTileableBandNode(isl::schedule_node Node)459 bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) {
460 if (isl_schedule_node_get_type(Node.get()) != isl_schedule_node_band)
461 return false;
462
463 if (isl_schedule_node_n_children(Node.get()) != 1)
464 return false;
465
466 if (!isl_schedule_node_band_get_permutable(Node.get()))
467 return false;
468
469 auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
470
471 if (unsignedFromIslSize(Space.dim(isl::dim::set)) <= 1u)
472 return false;
473
474 return isSimpleInnermostBand(Node);
475 }
476
477 __isl_give isl::schedule_node
applyTileBandOpt(isl::schedule_node Node)478 ScheduleTreeOptimizer::applyTileBandOpt(isl::schedule_node Node) {
479 if (FirstLevelTiling) {
480 Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes,
481 FirstLevelDefaultTileSize);
482 FirstLevelTileOpts++;
483 }
484
485 if (SecondLevelTiling) {
486 Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes,
487 SecondLevelDefaultTileSize);
488 SecondLevelTileOpts++;
489 }
490
491 if (RegisterTiling) {
492 Node =
493 applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize);
494 RegisterTileOpts++;
495 }
496
497 return Node;
498 }
499
500 isl::schedule_node
applyPrevectBandOpt(isl::schedule_node Node)501 ScheduleTreeOptimizer::applyPrevectBandOpt(isl::schedule_node Node) {
502 auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
503 int Dims = unsignedFromIslSize(Space.dim(isl::dim::set));
504
505 for (int i = Dims - 1; i >= 0; i--)
506 if (Node.as<isl::schedule_node_band>().member_get_coincident(i)) {
507 Node = prevectSchedBand(Node, i, PrevectorWidth);
508 break;
509 }
510
511 return Node;
512 }
513
514 __isl_give isl_schedule_node *
optimizeBand(__isl_take isl_schedule_node * NodeArg,void * User)515 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg,
516 void *User) {
517 const OptimizerAdditionalInfoTy *OAI =
518 static_cast<const OptimizerAdditionalInfoTy *>(User);
519 assert(OAI && "Expecting optimization options");
520
521 isl::schedule_node Node = isl::manage(NodeArg);
522 if (!isTileableBandNode(Node))
523 return Node.release();
524
525 if (OAI->PatternOpts) {
526 isl::schedule_node PatternOptimizedSchedule =
527 tryOptimizeMatMulPattern(Node, OAI->TTI, OAI->D);
528 if (!PatternOptimizedSchedule.is_null()) {
529 MatMulOpts++;
530 OAI->DepsChanged = true;
531 return PatternOptimizedSchedule.release();
532 }
533 }
534
535 if (OAI->Postopts)
536 Node = applyTileBandOpt(Node);
537
538 if (OAI->Prevect) {
539 // FIXME: Prevectorization requirements are different from those checked by
540 // isTileableBandNode.
541 Node = applyPrevectBandOpt(Node);
542 }
543
544 return Node.release();
545 }
546
547 isl::schedule
optimizeSchedule(isl::schedule Schedule,const OptimizerAdditionalInfoTy * OAI)548 ScheduleTreeOptimizer::optimizeSchedule(isl::schedule Schedule,
549 const OptimizerAdditionalInfoTy *OAI) {
550 auto Root = Schedule.get_root();
551 Root = optimizeScheduleNode(Root, OAI);
552 return Root.get_schedule();
553 }
554
optimizeScheduleNode(isl::schedule_node Node,const OptimizerAdditionalInfoTy * OAI)555 isl::schedule_node ScheduleTreeOptimizer::optimizeScheduleNode(
556 isl::schedule_node Node, const OptimizerAdditionalInfoTy *OAI) {
557 Node = isl::manage(isl_schedule_node_map_descendant_bottom_up(
558 Node.release(), optimizeBand,
559 const_cast<void *>(static_cast<const void *>(OAI))));
560 return Node;
561 }
562
isProfitableSchedule(Scop & S,isl::schedule NewSchedule)563 bool ScheduleTreeOptimizer::isProfitableSchedule(Scop &S,
564 isl::schedule NewSchedule) {
565 // To understand if the schedule has been optimized we check if the schedule
566 // has changed at all.
567 // TODO: We can improve this by tracking if any necessarily beneficial
568 // transformations have been performed. This can e.g. be tiling, loop
569 // interchange, or ...) We can track this either at the place where the
570 // transformation has been performed or, in case of automatic ILP based
571 // optimizations, by comparing (yet to be defined) performance metrics
572 // before/after the scheduling optimizer
573 // (e.g., #stride-one accesses)
574 // FIXME: A schedule tree whose union_map-conversion is identical to the
575 // original schedule map may still allow for parallelization, i.e. can still
576 // be profitable.
577 auto NewScheduleMap = NewSchedule.get_map();
578 auto OldSchedule = S.getSchedule();
579 assert(!OldSchedule.is_null() &&
580 "Only IslScheduleOptimizer can insert extension nodes "
581 "that make Scop::getSchedule() return nullptr.");
582 bool changed = !OldSchedule.is_equal(NewScheduleMap);
583 return changed;
584 }
585
586 class IslScheduleOptimizerWrapperPass final : public ScopPass {
587 public:
588 static char ID;
589
IslScheduleOptimizerWrapperPass()590 explicit IslScheduleOptimizerWrapperPass() : ScopPass(ID) {}
591
592 /// Optimize the schedule of the SCoP @p S.
593 bool runOnScop(Scop &S) override;
594
595 /// Print the new schedule for the SCoP @p S.
596 void printScop(raw_ostream &OS, Scop &S) const override;
597
598 /// Register all analyses and transformation required.
599 void getAnalysisUsage(AnalysisUsage &AU) const override;
600
601 /// Release the internal memory.
releaseMemory()602 void releaseMemory() override {
603 LastSchedule = {};
604 IslCtx.reset();
605 }
606
607 private:
608 std::shared_ptr<isl_ctx> IslCtx;
609 isl::schedule LastSchedule;
610 };
611
612 char IslScheduleOptimizerWrapperPass::ID = 0;
613
614 #ifndef NDEBUG
printSchedule(llvm::raw_ostream & OS,const isl::schedule & Schedule,StringRef Desc)615 static void printSchedule(llvm::raw_ostream &OS, const isl::schedule &Schedule,
616 StringRef Desc) {
617 isl::ctx Ctx = Schedule.ctx();
618 isl_printer *P = isl_printer_to_str(Ctx.get());
619 P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
620 P = isl_printer_print_schedule(P, Schedule.get());
621 char *Str = isl_printer_get_str(P);
622 OS << Desc << ": \n" << Str << "\n";
623 free(Str);
624 isl_printer_free(P);
625 }
626 #endif
627
628 /// Collect statistics for the schedule tree.
629 ///
630 /// @param Schedule The schedule tree to analyze. If not a schedule tree it is
631 /// ignored.
632 /// @param Version The version of the schedule tree that is analyzed.
633 /// 0 for the original schedule tree before any transformation.
634 /// 1 for the schedule tree after isl's rescheduling.
635 /// 2 for the schedule tree after optimizations are applied
636 /// (tiling, pattern matching)
walkScheduleTreeForStatistics(isl::schedule Schedule,int Version)637 static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) {
638 auto Root = Schedule.get_root();
639 if (Root.is_null())
640 return;
641
642 isl_schedule_node_foreach_descendant_top_down(
643 Root.get(),
644 [](__isl_keep isl_schedule_node *nodeptr, void *user) -> isl_bool {
645 isl::schedule_node Node = isl::manage_copy(nodeptr);
646 int Version = *static_cast<int *>(user);
647
648 switch (isl_schedule_node_get_type(Node.get())) {
649 case isl_schedule_node_band: {
650 NumBands[Version]++;
651 if (isl_schedule_node_band_get_permutable(Node.get()) ==
652 isl_bool_true)
653 NumPermutable[Version]++;
654
655 int CountMembers = isl_schedule_node_band_n_member(Node.get());
656 NumBandMembers[Version] += CountMembers;
657 for (int i = 0; i < CountMembers; i += 1) {
658 if (Node.as<isl::schedule_node_band>().member_get_coincident(i))
659 NumCoincident[Version]++;
660 }
661 break;
662 }
663
664 case isl_schedule_node_filter:
665 NumFilters[Version]++;
666 break;
667
668 case isl_schedule_node_extension:
669 NumExtension[Version]++;
670 break;
671
672 default:
673 break;
674 }
675
676 return isl_bool_true;
677 },
678 &Version);
679 }
680
runIslScheduleOptimizer(Scop & S,function_ref<const Dependences & (Dependences::AnalysisLevel)> GetDeps,TargetTransformInfo * TTI,OptimizationRemarkEmitter * ORE,isl::schedule & LastSchedule,bool & DepsChanged)681 static void runIslScheduleOptimizer(
682 Scop &S,
683 function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps,
684 TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE,
685 isl::schedule &LastSchedule, bool &DepsChanged) {
686
687 // Skip SCoPs in case they're already optimised by PPCGCodeGeneration
688 if (S.isToBeSkipped())
689 return;
690
691 // Skip empty SCoPs but still allow code generation as it will delete the
692 // loops present but not needed.
693 if (S.getSize() == 0) {
694 S.markAsOptimized();
695 return;
696 }
697
698 ScopsProcessed++;
699
700 // Schedule without optimizations.
701 isl::schedule Schedule = S.getScheduleTree();
702 walkScheduleTreeForStatistics(S.getScheduleTree(), 0);
703 LLVM_DEBUG(printSchedule(dbgs(), Schedule, "Original schedule tree"));
704
705 bool HasUserTransformation = false;
706 if (PragmaBasedOpts) {
707 isl::schedule ManuallyTransformed = applyManualTransformations(
708 &S, Schedule, GetDeps(Dependences::AL_Statement), ORE);
709 if (ManuallyTransformed.is_null()) {
710 LLVM_DEBUG(dbgs() << "Error during manual optimization\n");
711 return;
712 }
713
714 if (ManuallyTransformed.get() != Schedule.get()) {
715 // User transformations have precedence over other transformations.
716 HasUserTransformation = true;
717 Schedule = std::move(ManuallyTransformed);
718 LLVM_DEBUG(
719 printSchedule(dbgs(), Schedule, "After manual transformations"));
720 }
721 }
722
723 // Only continue if either manual transformations have been applied or we are
724 // allowed to apply heuristics.
725 // TODO: Detect disabled heuristics and no user-directed transformation
726 // metadata earlier in ScopDetection.
727 if (!HasUserTransformation && S.hasDisableHeuristicsHint()) {
728 LLVM_DEBUG(dbgs() << "Heuristic optimizations disabled by metadata\n");
729 return;
730 }
731
732 // Get dependency analysis.
733 const Dependences &D = GetDeps(Dependences::AL_Statement);
734 if (D.getSharedIslCtx() != S.getSharedIslCtx()) {
735 LLVM_DEBUG(dbgs() << "DependenceInfo for another SCoP/isl_ctx\n");
736 return;
737 }
738 if (!D.hasValidDependences()) {
739 LLVM_DEBUG(dbgs() << "Dependency information not available\n");
740 return;
741 }
742
743 // Apply ISL's algorithm only if not overriden by the user. Note that
744 // post-rescheduling optimizations (tiling, pattern-based, prevectorization)
745 // rely on the coincidence/permutable annotations on schedule tree bands that
746 // are added by the rescheduling analyzer. Therefore, disabling the
747 // rescheduler implicitly also disables these optimizations.
748 if (!EnableReschedule) {
749 LLVM_DEBUG(dbgs() << "Skipping rescheduling due to command line option\n");
750 } else if (HasUserTransformation) {
751 LLVM_DEBUG(
752 dbgs() << "Skipping rescheduling due to manual transformation\n");
753 } else {
754 // Build input data.
755 int ValidityKinds =
756 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
757 int ProximityKinds;
758
759 if (OptimizeDeps == "all")
760 ProximityKinds =
761 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
762 else if (OptimizeDeps == "raw")
763 ProximityKinds = Dependences::TYPE_RAW;
764 else {
765 errs() << "Do not know how to optimize for '" << OptimizeDeps << "'"
766 << " Falling back to optimizing all dependences.\n";
767 ProximityKinds =
768 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
769 }
770
771 isl::union_set Domain = S.getDomains();
772
773 if (Domain.is_null())
774 return;
775
776 isl::union_map Validity = D.getDependences(ValidityKinds);
777 isl::union_map Proximity = D.getDependences(ProximityKinds);
778
779 // Simplify the dependences by removing the constraints introduced by the
780 // domains. This can speed up the scheduling time significantly, as large
781 // constant coefficients will be removed from the dependences. The
782 // introduction of some additional dependences reduces the possible
783 // transformations, but in most cases, such transformation do not seem to be
784 // interesting anyway. In some cases this option may stop the scheduler to
785 // find any schedule.
786 if (SimplifyDeps == "yes") {
787 Validity = Validity.gist_domain(Domain);
788 Validity = Validity.gist_range(Domain);
789 Proximity = Proximity.gist_domain(Domain);
790 Proximity = Proximity.gist_range(Domain);
791 } else if (SimplifyDeps != "no") {
792 errs()
793 << "warning: Option -polly-opt-simplify-deps should either be 'yes' "
794 "or 'no'. Falling back to default: 'yes'\n";
795 }
796
797 LLVM_DEBUG(dbgs() << "\n\nCompute schedule from: ");
798 LLVM_DEBUG(dbgs() << "Domain := " << Domain << ";\n");
799 LLVM_DEBUG(dbgs() << "Proximity := " << Proximity << ";\n");
800 LLVM_DEBUG(dbgs() << "Validity := " << Validity << ";\n");
801
802 int IslMaximizeBands;
803 if (MaximizeBandDepth == "yes") {
804 IslMaximizeBands = 1;
805 } else if (MaximizeBandDepth == "no") {
806 IslMaximizeBands = 0;
807 } else {
808 errs()
809 << "warning: Option -polly-opt-maximize-bands should either be 'yes'"
810 " or 'no'. Falling back to default: 'yes'\n";
811 IslMaximizeBands = 1;
812 }
813
814 int IslOuterCoincidence;
815 if (OuterCoincidence == "yes") {
816 IslOuterCoincidence = 1;
817 } else if (OuterCoincidence == "no") {
818 IslOuterCoincidence = 0;
819 } else {
820 errs() << "warning: Option -polly-opt-outer-coincidence should either be "
821 "'yes' or 'no'. Falling back to default: 'no'\n";
822 IslOuterCoincidence = 0;
823 }
824
825 isl_ctx *Ctx = S.getIslCtx().get();
826
827 isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence);
828 isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands);
829 isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm);
830 isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient);
831 isl_options_set_tile_scale_tile_loops(Ctx, 0);
832
833 auto OnErrorStatus = isl_options_get_on_error(Ctx);
834 isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE);
835
836 auto SC = isl::schedule_constraints::on_domain(Domain);
837 SC = SC.set_proximity(Proximity);
838 SC = SC.set_validity(Validity);
839 SC = SC.set_coincidence(Validity);
840 Schedule = SC.compute_schedule();
841 isl_options_set_on_error(Ctx, OnErrorStatus);
842
843 ScopsRescheduled++;
844 LLVM_DEBUG(printSchedule(dbgs(), Schedule, "After rescheduling"));
845 }
846
847 walkScheduleTreeForStatistics(Schedule, 1);
848
849 // In cases the scheduler is not able to optimize the code, we just do not
850 // touch the schedule.
851 if (Schedule.is_null())
852 return;
853
854 if (GreedyFusion) {
855 isl::union_map Validity = D.getDependences(
856 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW);
857 Schedule = applyGreedyFusion(Schedule, Validity);
858 assert(!Schedule.is_null());
859 }
860
861 // Apply post-rescheduling optimizations (if enabled) and/or prevectorization.
862 const OptimizerAdditionalInfoTy OAI = {
863 TTI,
864 const_cast<Dependences *>(&D),
865 /*PatternOpts=*/!HasUserTransformation && PMBasedOpts,
866 /*Postopts=*/!HasUserTransformation && EnablePostopts,
867 /*Prevect=*/PollyVectorizerChoice != VECTORIZER_NONE,
868 DepsChanged};
869 if (OAI.PatternOpts || OAI.Postopts || OAI.Prevect) {
870 Schedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI);
871 Schedule = hoistExtensionNodes(Schedule);
872 LLVM_DEBUG(printSchedule(dbgs(), Schedule, "After post-optimizations"));
873 walkScheduleTreeForStatistics(Schedule, 2);
874 }
875
876 // Skip profitability check if user transformation(s) have been applied.
877 if (!HasUserTransformation &&
878 !ScheduleTreeOptimizer::isProfitableSchedule(S, Schedule))
879 return;
880
881 auto ScopStats = S.getStatistics();
882 ScopsOptimized++;
883 NumAffineLoopsOptimized += ScopStats.NumAffineLoops;
884 NumBoxedLoopsOptimized += ScopStats.NumBoxedLoops;
885 LastSchedule = Schedule;
886
887 S.setScheduleTree(Schedule);
888 S.markAsOptimized();
889
890 if (OptimizedScops)
891 errs() << S;
892 }
893
runOnScop(Scop & S)894 bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) {
895 releaseMemory();
896
897 Function &F = S.getFunction();
898 IslCtx = S.getSharedIslCtx();
899
900 auto getDependences =
901 [this](Dependences::AnalysisLevel) -> const Dependences & {
902 return getAnalysis<DependenceInfo>().getDependences(
903 Dependences::AL_Statement);
904 };
905 OptimizationRemarkEmitter &ORE =
906 getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
907 TargetTransformInfo *TTI =
908 &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
909
910 bool DepsChanged = false;
911 runIslScheduleOptimizer(S, getDependences, TTI, &ORE, LastSchedule,
912 DepsChanged);
913 if (DepsChanged)
914 getAnalysis<DependenceInfo>().abandonDependences();
915 return false;
916 }
917
runScheduleOptimizerPrinter(raw_ostream & OS,isl::schedule LastSchedule)918 static void runScheduleOptimizerPrinter(raw_ostream &OS,
919 isl::schedule LastSchedule) {
920 isl_printer *p;
921 char *ScheduleStr;
922
923 OS << "Calculated schedule:\n";
924
925 if (LastSchedule.is_null()) {
926 OS << "n/a\n";
927 return;
928 }
929
930 p = isl_printer_to_str(LastSchedule.ctx().get());
931 p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK);
932 p = isl_printer_print_schedule(p, LastSchedule.get());
933 ScheduleStr = isl_printer_get_str(p);
934 isl_printer_free(p);
935
936 OS << ScheduleStr << "\n";
937
938 free(ScheduleStr);
939 }
940
printScop(raw_ostream & OS,Scop &) const941 void IslScheduleOptimizerWrapperPass::printScop(raw_ostream &OS, Scop &) const {
942 runScheduleOptimizerPrinter(OS, LastSchedule);
943 }
944
getAnalysisUsage(AnalysisUsage & AU) const945 void IslScheduleOptimizerWrapperPass::getAnalysisUsage(
946 AnalysisUsage &AU) const {
947 ScopPass::getAnalysisUsage(AU);
948 AU.addRequired<DependenceInfo>();
949 AU.addRequired<TargetTransformInfoWrapperPass>();
950 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
951
952 AU.addPreserved<DependenceInfo>();
953 AU.addPreserved<OptimizationRemarkEmitterWrapperPass>();
954 }
955
956 } // namespace
957
createIslScheduleOptimizerWrapperPass()958 Pass *polly::createIslScheduleOptimizerWrapperPass() {
959 return new IslScheduleOptimizerWrapperPass();
960 }
961
962 INITIALIZE_PASS_BEGIN(IslScheduleOptimizerWrapperPass, "polly-opt-isl",
963 "Polly - Optimize schedule of SCoP", false, false);
964 INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
965 INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass);
966 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass);
967 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass);
968 INITIALIZE_PASS_END(IslScheduleOptimizerWrapperPass, "polly-opt-isl",
969 "Polly - Optimize schedule of SCoP", false, false)
970
971 static llvm::PreservedAnalyses
runIslScheduleOptimizerUsingNPM(Scop & S,ScopAnalysisManager & SAM,ScopStandardAnalysisResults & SAR,SPMUpdater & U,raw_ostream * OS)972 runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM,
973 ScopStandardAnalysisResults &SAR, SPMUpdater &U,
974 raw_ostream *OS) {
975 DependenceAnalysis::Result &Deps = SAM.getResult<DependenceAnalysis>(S, SAR);
976 auto GetDeps = [&Deps](Dependences::AnalysisLevel) -> const Dependences & {
977 return Deps.getDependences(Dependences::AL_Statement);
978 };
979 OptimizationRemarkEmitter ORE(&S.getFunction());
980 TargetTransformInfo *TTI = &SAR.TTI;
981 isl::schedule LastSchedule;
982 bool DepsChanged = false;
983 runIslScheduleOptimizer(S, GetDeps, TTI, &ORE, LastSchedule, DepsChanged);
984 if (DepsChanged)
985 Deps.abandonDependences();
986
987 if (OS) {
988 *OS << "Printing analysis 'Polly - Optimize schedule of SCoP' for region: '"
989 << S.getName() << "' in function '" << S.getFunction().getName()
990 << "':\n";
991 runScheduleOptimizerPrinter(*OS, LastSchedule);
992 }
993 return PreservedAnalyses::all();
994 }
995
996 llvm::PreservedAnalyses
run(Scop & S,ScopAnalysisManager & SAM,ScopStandardAnalysisResults & SAR,SPMUpdater & U)997 IslScheduleOptimizerPass::run(Scop &S, ScopAnalysisManager &SAM,
998 ScopStandardAnalysisResults &SAR, SPMUpdater &U) {
999 return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, nullptr);
1000 }
1001
1002 llvm::PreservedAnalyses
run(Scop & S,ScopAnalysisManager & SAM,ScopStandardAnalysisResults & SAR,SPMUpdater & U)1003 IslScheduleOptimizerPrinterPass::run(Scop &S, ScopAnalysisManager &SAM,
1004 ScopStandardAnalysisResults &SAR,
1005 SPMUpdater &U) {
1006 return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, &OS);
1007 }
1008
1009 //===----------------------------------------------------------------------===//
1010
1011 namespace {
1012 /// Print result from IslScheduleOptimizerWrapperPass.
1013 class IslScheduleOptimizerPrinterLegacyPass final : public ScopPass {
1014 public:
1015 static char ID;
1016
IslScheduleOptimizerPrinterLegacyPass()1017 IslScheduleOptimizerPrinterLegacyPass()
1018 : IslScheduleOptimizerPrinterLegacyPass(outs()) {}
IslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream & OS)1019 explicit IslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS)
1020 : ScopPass(ID), OS(OS) {}
1021
runOnScop(Scop & S)1022 bool runOnScop(Scop &S) override {
1023 IslScheduleOptimizerWrapperPass &P =
1024 getAnalysis<IslScheduleOptimizerWrapperPass>();
1025
1026 OS << "Printing analysis '" << P.getPassName() << "' for region: '"
1027 << S.getRegion().getNameStr() << "' in function '"
1028 << S.getFunction().getName() << "':\n";
1029 P.printScop(OS, S);
1030
1031 return false;
1032 }
1033
getAnalysisUsage(AnalysisUsage & AU) const1034 void getAnalysisUsage(AnalysisUsage &AU) const override {
1035 ScopPass::getAnalysisUsage(AU);
1036 AU.addRequired<IslScheduleOptimizerWrapperPass>();
1037 AU.setPreservesAll();
1038 }
1039
1040 private:
1041 llvm::raw_ostream &OS;
1042 };
1043
1044 char IslScheduleOptimizerPrinterLegacyPass::ID = 0;
1045 } // namespace
1046
createIslScheduleOptimizerPrinterLegacyPass(raw_ostream & OS)1047 Pass *polly::createIslScheduleOptimizerPrinterLegacyPass(raw_ostream &OS) {
1048 return new IslScheduleOptimizerPrinterLegacyPass(OS);
1049 }
1050
1051 INITIALIZE_PASS_BEGIN(IslScheduleOptimizerPrinterLegacyPass,
1052 "polly-print-opt-isl",
1053 "Polly - Print optimizer schedule of SCoP", false, false);
1054 INITIALIZE_PASS_DEPENDENCY(IslScheduleOptimizerWrapperPass)
1055 INITIALIZE_PASS_END(IslScheduleOptimizerPrinterLegacyPass,
1056 "polly-print-opt-isl",
1057 "Polly - Print optimizer schedule of SCoP", false, false)
1058