1 //===- Utils.cpp - Utilities to support the Linalg dialect ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements utilities for the Linalg dialect.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "mlir/Dialect/Linalg/Utils/Utils.h"
14 
15 #include "mlir/Dialect/Affine/EDSC/Intrinsics.h"
16 #include "mlir/Dialect/Affine/IR/AffineOps.h"
17 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
18 #include "mlir/Dialect/Linalg/IR/LinalgTypes.h"
19 #include "mlir/Dialect/SCF/EDSC/Builders.h"
20 #include "mlir/Dialect/SCF/SCF.h"
21 #include "mlir/Dialect/StandardOps/IR/Ops.h"
22 #include "mlir/IR/AffineExpr.h"
23 #include "mlir/IR/AffineMap.h"
24 #include "mlir/IR/Matchers.h"
25 #include "mlir/IR/OpImplementation.h"
26 #include "mlir/Pass/Pass.h"
27 #include "mlir/Transforms/LoopUtils.h"
28 
29 using namespace mlir;
30 using namespace mlir::linalg;
31 using namespace mlir::scf;
32 
33 Optional<RegionMatcher::BinaryOpKind>
34 RegionMatcher::matchAsScalarBinaryOp(GenericOp op) {
35   auto &region = op.region();
36   if (!llvm::hasSingleElement(region))
37     return llvm::None;
38 
39   Block &block = region.front();
40   if (block.getNumArguments() != 2 ||
41       !block.getArgument(0).getType().isSignlessIntOrFloat() ||
42       !block.getArgument(1).getType().isSignlessIntOrFloat())
43     return llvm::None;
44 
45   auto &ops = block.getOperations();
46   if (!llvm::hasSingleElement(block.without_terminator()))
47     return llvm::None;
48 
49   using mlir::matchers::m_Val;
50   auto a = m_Val(block.getArgument(0));
51   auto b = m_Val(block.getArgument(1));
52 
53   auto addPattern = m_Op<linalg::YieldOp>(m_Op<AddIOp>(a, b));
54   if (addPattern.match(&ops.back()))
55     return BinaryOpKind::IAdd;
56 
57   return llvm::None;
58 }
59 
60 bool mlir::linalg::isParallelIteratorType(Attribute attr) {
61   if (auto strAttr = attr.dyn_cast<StringAttr>()) {
62     return strAttr.getValue() == getParallelIteratorTypeName();
63   }
64   return false;
65 }
66 
67 bool mlir::linalg::isReductionIteratorType(Attribute attr) {
68   if (auto strAttr = attr.dyn_cast<StringAttr>()) {
69     return strAttr.getValue() == getReductionIteratorTypeName();
70   }
71   return false;
72 }
73 
74 bool mlir::linalg::isWindowIteratorType(Attribute attr) {
75   if (auto strAttr = attr.dyn_cast<StringAttr>()) {
76     return strAttr.getValue() == getWindowIteratorTypeName();
77   }
78   return false;
79 }
80 
/// Explicit instantiation of loop nest generator for different loop types.
/// The per-loop-type specializations of `doit` are defined below in this file.
template struct mlir::linalg::GenerateLoopNest<scf::ForOp>;
template struct mlir::linalg::GenerateLoopNest<scf::ParallelOp>;
template struct mlir::linalg::GenerateLoopNest<AffineForOp>;
85 
86 /// Given a list of subview ranges, extract individual values for lower, upper
87 /// bounds and steps and put them into the corresponding vectors.
88 static void unpackRanges(ArrayRef<Range> ranges, SmallVectorImpl<Value> &lbs,
89                          SmallVectorImpl<Value> &ubs,
90                          SmallVectorImpl<Value> &steps) {
91   for (Range range : ranges) {
92     lbs.emplace_back(range.offset);
93     ubs.emplace_back(range.size);
94     steps.emplace_back(range.stride);
95   }
96 }
97 
98 namespace mlir {
99 namespace linalg {
100 
101 SmallVector<int64_t, 8> getStaticShape(LinalgOp linalgOp) {
102   SmallVector<int64_t, 8> res;
103   for (Value v : linalgOp.getShapedOperands()) {
104     auto shape = v.getType().cast<ShapedType>().getShape();
105     res.append(shape.begin(), shape.end());
106   }
107   return res;
108 }
109 
110 Optional<SmallVector<int64_t, 4>> getStaticLoopRanges(LinalgOp linalgOp) {
111   SmallVector<int64_t, 8> viewSizes = getStaticShape(linalgOp);
112   AffineMap invertedMap = linalgOp.getShapesToLoopsMap();
113   if (!invertedMap)
114     return {};
115   return invertedMap.compose(viewSizes);
116 }
117 
118 /// If `size` comes from an AffineMinOp and one of the values of AffineMinOp
119 /// is a constant then return a new value set to the smallest such constant.
120 /// Otherwise returngetSmallestBoundingIndex nullptr.
121 IntegerAttr getSmallestBoundingIndex(Value size) {
122   Optional<int64_t> boundingConst = {};
123   if (auto affineMinOp = size.getDefiningOp<AffineMinOp>()) {
124     for (auto e : affineMinOp.getAffineMap().getResults())
125       if (auto cst = e.dyn_cast<AffineConstantExpr>())
126         boundingConst = boundingConst
127                             ? std::min(boundingConst.getValue(), cst.getValue())
128                             : cst.getValue();
129   } else if (auto constIndexOp = size.getDefiningOp<ConstantOp>()) {
130     if (constIndexOp.getType().isa<IndexType>())
131       boundingConst = constIndexOp.value().cast<IntegerAttr>().getInt();
132   } else if (auto affineApplyOp = size.getDefiningOp<AffineApplyOp>()) {
133     if (auto cExpr = affineApplyOp.getAffineMap()
134                          .getResult(0)
135                          .dyn_cast<AffineConstantExpr>())
136       boundingConst = cExpr.getValue();
137   }
138   if (boundingConst && *boundingConst >= 0)
139     return Builder(size.getContext()).getIndexAttr(*boundingConst);
140   return nullptr;
141 }
142 
/// Specialization to build an scf "for" nest.
template <>
void GenerateLoopNest<scf::ForOp>::doit(
    ArrayRef<Range> loopRanges, ValueRange iterArgInitValues,
    ArrayRef<Attribute> iteratorTypes,
    function_ref<scf::ValueVector(ValueRange, ValueRange)> bodyBuilderFn,
    Optional<LinalgLoopDistributionOptions> distributionOptions) {
  // Create procInfo so it dominates loops, if appropriate.
  // NOTE: `iteratorTypes` is not consulted here; every range becomes a
  // sequential scf.for loop.
  OpBuilder &builder = edsc::ScopedContext::getBuilderRef();
  Location loc = edsc::ScopedContext::getLocation();
  SmallVector<ProcInfo, 2> procInfo;
  if (distributionOptions.hasValue())
    procInfo = distributionOptions->procInfo(builder, loc, loopRanges);

  // Build the scf.for nest from the unpacked range bounds/steps.
  SmallVector<Value, 4> lbs, ubs, steps;
  unpackRanges(loopRanges, lbs, ubs, steps);
  LoopNest loopNest =
      edsc::loopNestBuilder(lbs, ubs, steps, iterArgInitValues, bodyBuilderFn);

  if (!distributionOptions.hasValue() || loopNest.loops.empty())
    return;

  // Only supports cyclic distribution for now.
  // llvm::zip stops at the shortest range, so loops beyond the provided
  // procInfo/distributionMethod entries are left undistributed.
  for (auto it : llvm::zip(loopNest.loops, procInfo,
                           distributionOptions->distributionMethod))
    if (std::get<2>(it) == DistributionMethod::Cyclic)
      mapLoopToProcessorIds(std::get<0>(it), std::get<1>(it).procId,
                            std::get<1>(it).nprocs);
}
172 
/// Specialization to build affine "for" nest. Distribution options are
/// accepted for interface uniformity but ignored; affine nests are not
/// distributed here.
template <>
void GenerateLoopNest<AffineForOp>::doit(
    ArrayRef<Range> loopRanges, ValueRange iterArgInitValues,
    ArrayRef<Attribute> iteratorTypes,
    function_ref<scf::ValueVector(ValueRange, ValueRange)> bodyBuilderFn,
    Optional<LinalgLoopDistributionOptions>) {
  assert(iterArgInitValues.empty() && "unexpected AffineForOp init values");
  SmallVector<Value, 4> lbs, ubs, steps;
  unpackRanges(loopRanges, lbs, ubs, steps);

  // Affine loops require constant steps.
  SmallVector<int64_t, 4> constantSteps;
  constantSteps.reserve(steps.size());
  for (Value v : steps) {
    auto op = v.getDefiningOp<ConstantIndexOp>();
    assert(op && "Affine loops require constant steps");
    constantSteps.push_back(op.getValue());
  }

  // Adapt the iter-args-aware body builder to the affine builder, which
  // supplies only induction variables (no loop-carried values).
  auto bodyBuilderWithoutIterArgsFn = [&](ValueRange ivs) {
    bodyBuilderFn(ivs, {});
  };
  edsc::affineLoopNestBuilder(lbs, ubs, constantSteps,
                              bodyBuilderWithoutIterArgsFn);
}
199 
/// Update the `lb`, `ub` and `step` to get per processor `lb`, `ub` and `step`.
/// Cyclic distribution: processor `procId` starts at `lb + procId * step` and
/// advances by `nprocs * step`; the upper bound is left unchanged (and the
/// `builder`/`loc`/`ub` parameters are unused by this scheme — the EDSC
/// operators pick up the scoped builder implicitly).
static void updateBoundsForCyclicDistribution(OpBuilder &builder, Location loc,
                                              Value procId, Value nprocs,
                                              Value &lb, Value &ub,
                                              Value &step) {
  using edsc::op::operator+;
  using edsc::op::operator*;
  lb = lb + (procId * step);
  step = nprocs * step;
}
210 
/// Generates a loop nest consisting of scf.parallel and scf.for, depending
/// on the `iteratorTypes.` Consecutive parallel loops create a single
/// scf.parallel operation; each sequential loop creates a new scf.for
/// operation. The body of the innermost loop is populated by
/// `bodyBuilderFn` that accepts a range of induction variables for all
/// loops. `ivStorage` is used to store the partial list of induction
/// variables.
// TODO: this function can be made iterative instead. However, it
// will have at most as many recursive calls as nested loops, which rarely
// exceeds 10.
static void
generateParallelLoopNest(ValueRange lbs, ValueRange ubs, ValueRange steps,
                         ArrayRef<Attribute> iteratorTypes,
                         function_ref<void(ValueRange)> bodyBuilderFn,
                         SmallVectorImpl<Value> &ivStorage,
                         ArrayRef<DistributionMethod> distributionMethod = {}) {
  assert(lbs.size() == ubs.size());
  assert(lbs.size() == steps.size());
  assert(lbs.size() == iteratorTypes.size());

  // If there are no (more) loops to be generated, generate the body and be
  // done with it.
  if (iteratorTypes.empty())
    return bodyBuilderFn(ivStorage);

  // Find the outermost parallel loops and drop their types from the list.
  unsigned nLoops = iteratorTypes.size();
  unsigned nOuterPar =
      nLoops - iteratorTypes.drop_while(isParallelIteratorType).size();

  // If there are no outer parallel loops, generate one sequential loop and
  // recurse. Note that we wouldn't have dropped anything from `iteratorTypes`
  // in this case.
  if (nOuterPar == 0) {
    edsc::loopNestBuilder(lbs[0], ubs[0], steps[0], [&](Value iv) {
      ivStorage.push_back(iv);
      generateParallelLoopNest(lbs.drop_front(), ubs.drop_front(),
                               steps.drop_front(), iteratorTypes.drop_front(),
                               bodyBuilderFn, ivStorage, distributionMethod);
    });
    return;
  }
  if (distributionMethod.empty()) {
    // Generate a single parallel loop-nest operation for all outermost
    // parallel loops and recurse.
    edsc::OperationBuilder<scf::ParallelOp>(
        lbs.take_front(nOuterPar), ubs.take_front(nOuterPar),
        steps.take_front(nOuterPar),
        [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange localIvs) {
          edsc::ScopedContext context(nestedBuilder, nestedLoc);
          ivStorage.append(localIvs.begin(), localIvs.end());
          // Drop the distribution entries consumed here; if fewer entries
          // than loops were supplied, stop distributing entirely.
          generateParallelLoopNest(
              lbs.drop_front(nOuterPar), ubs.drop_front(nOuterPar),
              steps.drop_front(nOuterPar), iteratorTypes.drop_front(nOuterPar),
              bodyBuilderFn, ivStorage,
              (distributionMethod.size() < nOuterPar)
                  ? ArrayRef<DistributionMethod>()
                  : distributionMethod.drop_front(nOuterPar));
        });
    return;
  }

  // Process all consecutive similarly distributed loops simultaneously.
  DistributionMethod methodToUse = distributionMethod[0];
  unsigned numProcessed = 1;
  for (unsigned i = 1; i < nOuterPar && i < distributionMethod.size(); ++i) {
    if (distributionMethod[i] != methodToUse)
      break;
    numProcessed++;
  }

  switch (methodToUse) {
  case DistributionMethod::Cyclic: {
    // Generate a single parallel loop-nest operation for all outermost
    // parallel loops and recurse.
    edsc::OperationBuilder<scf::ParallelOp>(
        lbs.take_front(numProcessed), ubs.take_front(numProcessed),
        steps.take_front(numProcessed),
        [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange localIvs) {
          edsc::ScopedContext context(nestedBuilder, nestedLoc);
          ivStorage.append(localIvs.begin(), localIvs.end());
          generateParallelLoopNest(
              lbs.drop_front(numProcessed), ubs.drop_front(numProcessed),
              steps.drop_front(numProcessed),
              iteratorTypes.drop_front(numProcessed), bodyBuilderFn, ivStorage,
              (distributionMethod.size() < numProcessed)
                  ? ArrayRef<DistributionMethod>()
                  : distributionMethod.drop_front(numProcessed));
        });
    return;
  }
  case DistributionMethod::CyclicNumProcsGeNumIters: {
    // Check (for the processed loops) that the iteration is in-bounds.
    // No loop is emitted: each processor handles at most one iteration,
    // guarded by `lb < ub` for every processed dimension.
    using edsc::op::slt;
    using edsc::op::operator&&;
    Value cond = slt(lbs[0], ubs[0]);
    for (unsigned i = 1; i < numProcessed; ++i)
      cond = cond && slt(lbs[i], ubs[i]);
    ivStorage.append(lbs.begin(), std::next(lbs.begin(), numProcessed));
    edsc::conditionBuilder(cond, [&]() {
      generateParallelLoopNest(
          lbs.drop_front(numProcessed), ubs.drop_front(numProcessed),
          steps.drop_front(numProcessed),
          iteratorTypes.drop_front(numProcessed), bodyBuilderFn, ivStorage,
          distributionMethod.drop_front(numProcessed));
    });
    return;
  }
  case DistributionMethod::CyclicNumProcsEqNumIters:
    // No check/loops needed here. Set the `%iv` to be the `%lb` and proceed
    // with inner loop generation.
    ivStorage.append(lbs.begin(), std::next(lbs.begin(), numProcessed));
    generateParallelLoopNest(
        lbs.drop_front(numProcessed), ubs.drop_front(numProcessed),
        steps.drop_front(numProcessed), iteratorTypes.drop_front(numProcessed),
        bodyBuilderFn, ivStorage, distributionMethod.drop_front(numProcessed));
    return;
  }
}
330 
/// Specialization for generating a mix of parallel and sequential scf loops.
template <>
void GenerateLoopNest<scf::ParallelOp>::doit(
    ArrayRef<Range> loopRanges, ValueRange iterArgInitValues,
    ArrayRef<Attribute> iteratorTypes,
    function_ref<scf::ValueVector(ValueRange, ValueRange)> bodyBuilderFn,
    Optional<LinalgLoopDistributionOptions> distributionOptions) {
  assert(iterArgInitValues.empty() && "unexpected ParallelOp init values");
  // This function may be passed more iterator types than ranges.
  assert(iteratorTypes.size() >= loopRanges.size() &&
         "expected iterator type for all ranges");
  iteratorTypes = iteratorTypes.take_front(loopRanges.size());
  SmallVector<Value, 8> lbsStorage, ubsStorage, stepsStorage, ivs;
  unsigned numLoops = iteratorTypes.size();
  ivs.reserve(numLoops);
  lbsStorage.reserve(numLoops);
  ubsStorage.reserve(numLoops);
  stepsStorage.reserve(numLoops);

  // Get the loop lb, ub, and step.
  unpackRanges(loopRanges, lbsStorage, ubsStorage, stepsStorage);

  // Modify the lb, ub, and step based on the distribution options.
  SmallVector<DistributionMethod, 0> distributionMethod;
  if (distributionOptions) {
    auto &options = distributionOptions.getValue();
    OpBuilder &builder = edsc::ScopedContext::getBuilderRef();
    Location loc = edsc::ScopedContext::getLocation();
    distributionMethod.assign(distributionOptions->distributionMethod.begin(),
                              distributionOptions->distributionMethod.end());
    // Collect the ranges of the parallel dimensions only; distribution
    // applies to parallel loops exclusively.
    SmallVector<Range, 2> parallelLoopRanges;
    for (auto iteratorType : enumerate(iteratorTypes)) {
      if (isParallelIteratorType(iteratorType.value()))
        parallelLoopRanges.push_back(loopRanges[iteratorType.index()]);
    }
    // If fewer distribution methods than parallel loops were provided, only
    // the first `distributionMethod.size()` parallel loops are distributed.
    if (distributionMethod.size() < parallelLoopRanges.size())
      parallelLoopRanges.resize(distributionMethod.size());
    SmallVector<ProcInfo, 2> procInfo =
        options.procInfo(builder, loc, parallelLoopRanges);
    // Rewrite each distributed parallel loop's bounds to the per-processor
    // cyclic form; `index` walks procInfo in parallel-loop order.
    unsigned index = 0;
    for (auto iteratorType : enumerate(iteratorTypes)) {
      if (index >= procInfo.size())
        break;
      if (isParallelIteratorType(iteratorType.value())) {
        unsigned i = iteratorType.index();
        updateBoundsForCyclicDistribution(builder, loc, procInfo[index].procId,
                                          procInfo[index].nprocs, lbsStorage[i],
                                          ubsStorage[i], stepsStorage[i]);
        index++;
      }
    }
  }
  ValueRange lbs(lbsStorage), ubs(ubsStorage), steps(stepsStorage);
  // Adapt the iter-args-aware body builder: parallel nests carry no
  // loop-carried values.
  auto bodyBuilderWithoutIterArgsFn = [&](ValueRange ivs) {
    bodyBuilderFn(ivs, {});
  };
  generateParallelLoopNest(lbs, ubs, steps, iteratorTypes,
                           bodyBuilderWithoutIterArgsFn, ivs,
                           distributionMethod);

  assert(ivs.size() == iteratorTypes.size() && "did not generate enough loops");
}
393 
394 } // namespace linalg
395 } // namespace mlir
396