1 //===- Utils.cpp - Utilities to support the Linalg dialect ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements utilities for the Linalg dialect.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "mlir/Dialect/Linalg/Utils/Utils.h"
14 
15 #include "mlir/Analysis/SliceAnalysis.h"
16 #include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
17 #include "mlir/Dialect/Affine/IR/AffineOps.h"
18 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
19 #include "mlir/Dialect/Affine/LoopUtils.h"
20 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
21 #include "mlir/Dialect/Arithmetic/Utils/Utils.h"
22 #include "mlir/Dialect/Linalg/IR/Linalg.h"
23 #include "mlir/Dialect/MemRef/IR/MemRef.h"
24 #include "mlir/Dialect/SCF/SCF.h"
25 #include "mlir/Dialect/Tensor/IR/Tensor.h"
26 #include "mlir/Dialect/Tensor/Utils/Utils.h"
27 #include "mlir/Dialect/Utils/StaticValueUtils.h"
28 #include "mlir/IR/AffineExpr.h"
29 #include "mlir/IR/AffineExprVisitor.h"
30 #include "mlir/IR/AffineMap.h"
31 #include "mlir/IR/Matchers.h"
32 #include "mlir/IR/OpImplementation.h"
33 #include "mlir/Pass/Pass.h"
34 #include "llvm/ADT/TypeSwitch.h"
35 #include "llvm/Support/Debug.h"
36 
37 #define DEBUG_TYPE "linalg-utils"
38 
39 using namespace mlir;
40 using namespace mlir::linalg;
41 using namespace mlir::scf;
42 
43 static bool isZero(Value v) {
44   if (auto cst = v.getDefiningOp<arith::ConstantIndexOp>())
45     return cst.value() == 0;
46   return false;
47 }
48 
49 namespace {
50 
51 // Helper visitor to determine whether an AffineExpr is tiled.
52 // This is achieved by traversing every AffineDimExpr with position `pos` and
53 // checking whether the corresponding `tileSizes[pos]` is non-zero.
54 // This also enforces only positive coefficients occur in multiplications.
55 //
56 // Example:
57 //   `d0 + 2 * d1 + d3` is tiled by [0, 0, 0, 2] but not by [0, 0, 2, 0]
58 //
59 struct TileCheck : public AffineExprVisitor<TileCheck> {
60   TileCheck(ValueRange tileSizes) : tileSizes(tileSizes) {}
61 
62   void visitDimExpr(AffineDimExpr expr) {
63     isTiled |= !isZero(tileSizes[expr.getPosition()]);
64   }
65   void visitAffineBinaryOpExpr(AffineBinaryOpExpr expr) {
66     visit(expr.getLHS());
67     visit(expr.getRHS());
68     if (expr.getKind() == mlir::AffineExprKind::Mul)
69       assert(expr.getRHS().cast<AffineConstantExpr>().getValue() > 0 &&
70              "nonpositive multiplying coefficient");
71   }
72   bool isTiled = false;
73   ValueRange tileSizes;
74 };
75 
76 } // namespace
77 
78 static bool isTiled(AffineExpr expr, ValueRange tileSizes) {
79   if (!expr)
80     return false;
81   TileCheck t(tileSizes);
82   t.visit(expr);
83   return t.isTiled;
84 }
85 
86 // Checks whether the `map  varies with respect to a non-zero `tileSize`.
87 static bool isTiled(AffineMap map, ValueRange tileSizes) {
88   if (!map)
89     return false;
90   for (unsigned r = 0; r < map.getNumResults(); ++r)
91     if (isTiled(map.getResult(r), tileSizes))
92       return true;
93   return false;
94 }
95 
96 Optional<RegionMatcher::BinaryOpKind>
97 RegionMatcher::matchAsScalarBinaryOp(GenericOp op) {
98   auto &region = op.region();
99   if (!llvm::hasSingleElement(region))
100     return llvm::None;
101 
102   Block &block = region.front();
103   if (block.getNumArguments() != 2 ||
104       !block.getArgument(0).getType().isSignlessIntOrFloat() ||
105       !block.getArgument(1).getType().isSignlessIntOrFloat())
106     return llvm::None;
107 
108   auto &ops = block.getOperations();
109   if (!llvm::hasSingleElement(block.without_terminator()))
110     return llvm::None;
111 
112   using mlir::matchers::m_Val;
113   auto a = m_Val(block.getArgument(0));
114   auto b = m_Val(block.getArgument(1));
115 
116   auto addPattern = m_Op<linalg::YieldOp>(m_Op<arith::AddIOp>(a, b));
117   if (addPattern.match(&ops.back()))
118     return BinaryOpKind::IAdd;
119 
120   return llvm::None;
121 }
122 
123 /// Explicit instantiation of loop nest generator for different loop types.
124 template struct mlir::linalg::GenerateLoopNest<scf::ForOp>;
125 template struct mlir::linalg::GenerateLoopNest<scf::ParallelOp>;
126 template struct mlir::linalg::GenerateLoopNest<AffineForOp>;
127 
128 /// Given a list of subview ranges, extract individual values for lower, upper
129 /// bounds and steps and put them into the corresponding vectors.
130 static void unpackRanges(ArrayRef<Range> ranges, SmallVectorImpl<Value> &lbs,
131                          SmallVectorImpl<Value> &ubs,
132                          SmallVectorImpl<Value> &steps) {
133   for (Range range : ranges) {
134     lbs.emplace_back(range.offset);
135     ubs.emplace_back(range.size);
136     steps.emplace_back(range.stride);
137   }
138 }
139 
140 namespace mlir {
141 namespace linalg {
142 
143 bool isPermutation(ArrayRef<int64_t> permutation) {
144   // Count the number of appearances for all indices.
145   SmallVector<int64_t> indexCounts(permutation.size(), 0);
146   for (auto index : permutation) {
147     // Exit if the index is out-of-range.
148     if (index < 0 || index >= static_cast<int64_t>(permutation.size()))
149       return false;
150     indexCounts[index]++;
151   }
152   // Return true if all indices appear once.
153   return count(indexCounts, 1) == static_cast<int64_t>(permutation.size());
154 }
155 
156 /// Helper function that creates a memref::DimOp or tensor::DimOp depending on
157 /// the type of `source`.
158 Value createOrFoldDimOp(OpBuilder &b, Location loc, Value source, int64_t dim) {
159   if (source.getType().isa<UnrankedMemRefType, MemRefType>())
160     return b.createOrFold<memref::DimOp>(loc, source, dim);
161   if (source.getType().isa<UnrankedTensorType, RankedTensorType>())
162     return b.createOrFold<tensor::DimOp>(loc, source, dim);
163   llvm_unreachable("Expected MemRefType or TensorType");
164 }
165 
166 /// Given an operation, retrieves the value of each dynamic dimension through
167 /// constructing the necessary DimOp operators.
168 SmallVector<Value, 4> getDynOperands(Location loc, Value val, OpBuilder &b) {
169   SmallVector<Value, 4> dynOperands;
170   auto shapedType = val.getType().cast<ShapedType>();
171   for (const auto &dim : llvm::enumerate(shapedType.getShape())) {
172     if (dim.value() == ShapedType::kDynamicSize)
173       dynOperands.push_back(createOrFoldDimOp(b, loc, val, dim.index()));
174   }
175   return dynOperands;
176 }
177 
178 void getUpperBoundForIndex(Value value, AffineMap &boundMap,
179                            SmallVectorImpl<Value> &boundOperands) {
180   // Initialize `boundMap` and `boundOperands` to the identity returning
181   // `value`. This combination is the default result of the method if no
182   // simplification is possible.
183   assert(value.getType().isIndex() && "expect value to have index type");
184   boundMap = AffineMap::getMultiDimIdentityMap(1, value.getContext());
185   boundOperands.assign({value});
186   canonicalizeMapAndOperands(&boundMap, &boundOperands);
187 
188   // Continue only if there is an affine index computation to simplify.
189   Operation *definingOp = value.getDefiningOp();
190   if (!definingOp || !isa<AffineApplyOp, AffineMinOp>(definingOp))
191     return;
192 
193   // Get the backward slice containing the affine index computation.
194   SetVector<Operation *> backwardSlice;
195   getBackwardSlice(definingOp, &backwardSlice, [](Operation *op) {
196     return isa<AffineApplyOp, AffineMinOp>(op);
197   });
198   backwardSlice.insert(definingOp);
199 
200   // Setup a system of affine constraints that describe the index computation.
201   FlatAffineValueConstraints constraints;
202 
203   // Helper to find or create an identifier for the given value.
204   auto findOrCreateId = [&](Value value) {
205     if (!constraints.containsId(value)) {
206       constraints.appendDimId(value);
207       return true;
208     }
209     unsigned pos;
210     constraints.findId(value, &pos);
211     return pos < constraints.getNumDimIds();
212   };
213   // Helper to get the position for the given value.
214   auto getPosition = [&](Value value) {
215     unsigned pos;
216     bool exists = constraints.findId(value, &pos);
217     (void)exists;
218     assert(exists && "expect to find the identifier");
219     return pos;
220   };
221 
222   // Add the affine operations in `backwardSlice` to the constraints.
223   for (Operation *op : llvm::reverse(backwardSlice)) {
224     // Add an identifier for all op results and operands.
225     if (!(llvm::all_of(op->getResults(), findOrCreateId) &&
226           llvm::all_of(op->getOperands(), findOrCreateId)))
227       return;
228     // Add AffineApplyOps to the constraints.
229     if (auto applyOp = dyn_cast<AffineApplyOp>(op)) {
230       AffineValueMap valueMap(applyOp.getAffineMap(), applyOp.getOperands(),
231                               applyOp.getResult());
232       if (failed(constraints.composeMap(&valueMap)))
233         return;
234       continue;
235     }
236     // Add AffineMinOps to the constraints.
237     auto minOp = cast<AffineMinOp>(op);
238     AffineMap map = constraints.computeAlignedMap(minOp.getAffineMap(),
239                                                   minOp.getOperands());
240     if (failed(constraints.addBound(FlatAffineConstraints::UB,
241                                     getPosition(minOp.getResult()), map)))
242       return;
243   }
244 
245   // Obtain an upper bound for the affine index computation by projecting out
246   // all temporary results and expressing the upper bound for `value` in terms
247   // of the terminals of the index computation.
248   SmallVector<AffineMap> lowerBounds(1), upperBounds(1);
249   constraints.getSliceBounds(getPosition(value), 1, value.getContext(),
250                              &lowerBounds, &upperBounds);
251 
252   // Verify `upperBounds[0]` is valid and has at least one result.
253   if (!upperBounds[0] || upperBounds[0].getNumResults() == 0)
254     return;
255 
256   // Set `boundMap` and `boundOperands` to the computed upper bound.
257   boundMap = upperBounds[0];
258   constraints.getAllValues(&boundOperands);
259   erase_value(boundOperands, value);
260   canonicalizeMapAndOperands(&boundMap, &boundOperands);
261 }
262 
263 FailureOr<int64_t> getConstantUpperBoundForIndex(Value value) {
264   // Compute an upper bound for `value`.
265   AffineMap boundMap;
266   SmallVector<Value> boundOperands;
267   getUpperBoundForIndex(value, boundMap, boundOperands);
268 
269   // Search the results of `boundMap` for constant upper bounds.
270   SmallVector<int64_t> constantBounds;
271   for (AffineExpr result : boundMap.getResults())
272     if (auto constExpr = result.dyn_cast<AffineConstantExpr>())
273       constantBounds.push_back(constExpr.getValue());
274 
275   // Return the minimal upper bound or failure if none is found.
276   if (constantBounds.empty())
277     return failure();
278   return *std::min_element(constantBounds.begin(), constantBounds.end());
279 }
280 
281 tensor::ExtractSliceOp makeComposedExtractSliceOp(
282     OpBuilder &b, Location loc, Value source, ArrayRef<OpFoldResult> offsets,
283     ArrayRef<OpFoldResult> sizes, ArrayRef<OpFoldResult> strides) {
284   assert(source && "expect source to be nonzero");
285 
286   // Do not fold if the producer is not an ExtractSliceOp.
287   auto producerOp = source.getDefiningOp<tensor::ExtractSliceOp>();
288   if (!producerOp)
289     return b.create<tensor::ExtractSliceOp>(loc, source, offsets, sizes,
290                                             strides);
291 
292   // Do not fold if the producer is rank reducing or if there are any non-unit
293   // strides. Supporting non-unit strides complicates the offset computation
294   // since the consumer offsets need to be multiplied by the producer strides.
295   // TODO: support non-unit strides once there are use cases.
296   SmallVector<OpFoldResult> allStrides = producerOp.getMixedStrides();
297   allStrides.append(strides.begin(), strides.end());
298   bool hasNonUnitStride = any_of(allStrides, [](OpFoldResult ofr) {
299     return getConstantIntValue(ofr) != static_cast<int64_t>(1);
300   });
301   if (hasNonUnitStride ||
302       producerOp.getSourceType().getRank() !=
303           producerOp.getResult().getType().cast<ShapedType>().getRank())
304     return b.create<tensor::ExtractSliceOp>(loc, source, offsets, sizes,
305                                             strides);
306 
307   // Fold the producer by adding the offests and extracting the slice directly
308   // from the producer source tensor.
309   SmallVector<OpFoldResult> foldedOffsets(offsets.begin(), offsets.end());
310   AffineExpr dim1, dim2;
311   bindDims(b.getContext(), dim1, dim2);
312   for (const auto &en : enumerate(producerOp.getMixedOffsets())) {
313     SmallVector<Value> offsetValues = {
314         getValueOrCreateConstantIndexOp(b, loc, foldedOffsets[en.index()]),
315         getValueOrCreateConstantIndexOp(b, loc, en.value())};
316     foldedOffsets[en.index()] =
317         makeComposedAffineApply(b, loc, dim1 + dim2, offsetValues).getResult();
318   }
319   return b.create<tensor::ExtractSliceOp>(loc, producerOp.source(),
320                                           foldedOffsets, sizes, strides);
321 }
322 
323 Value makeComposedPadHighOp(OpBuilder &b, Location loc, RankedTensorType type,
324                             Value source, Value pad, bool nofold) {
325   assert(type.hasStaticShape() && "expect tensor type to have static shape");
326 
327   // Exit if `source` is not defined by an ExtractSliceOp.
328   auto sliceOp = source.getDefiningOp<tensor::ExtractSliceOp>();
329   if (!sliceOp)
330     return tensor::createPadHighOp(type, source, pad, nofold, loc, b);
331 
332   // Search the `source` use-def chain for padded LinalgOps.
333   Value current = sliceOp.source();
334   while (current) {
335     auto linalgOp = current.getDefiningOp<LinalgOp>();
336     if (!linalgOp)
337       break;
338     OpResult opResult = current.cast<OpResult>();
339     current = linalgOp.getOutputOperand(opResult.getResultNumber())->get();
340   }
341   auto padOp = current ? current.getDefiningOp<tensor::PadOp>() : nullptr;
342 
343   // Exit if the search fails to match a tensor::PadOp at the end of the matched
344   // LinalgOp sequence.
345   if (!padOp)
346     return tensor::createPadHighOp(type, source, pad, nofold, loc, b);
347 
348   // Exit if the padded result type does not match.
349   if (sliceOp.source().getType() != type)
350     return tensor::createPadHighOp(type, source, pad, nofold, loc, b);
351 
352   // Exit if the LinalgOps are not high padded.
353   if (llvm::any_of(padOp.getMixedLowPad(), [](OpFoldResult ofr) {
354         return getConstantIntValue(ofr) != static_cast<int64_t>(0);
355       }))
356     return tensor::createPadHighOp(type, source, pad, nofold, loc, b);
357 
358   // Exit if `padOpSliceOp`, which defines the slice used by
359   // `padOp`, is rank-reducing.
360   auto padOpSliceOp = padOp.source().getDefiningOp<tensor::ExtractSliceOp>();
361   if (!padOpSliceOp ||
362       sliceOp.getMixedSizes().size() != padOpSliceOp.getMixedSizes().size())
363     return tensor::createPadHighOp(type, source, pad, nofold, loc, b);
364 
365   // Exit if the sizes of the dynamic sizes of `sliceOp` do not match the size
366   // of the slice padded by `padOp`.
367   if (llvm::any_of(
368           llvm::zip(sliceOp.getMixedSizes(), padOpSliceOp.getMixedSizes()),
369           [](std::tuple<OpFoldResult, OpFoldResult> it) {
370             return !isEqualConstantIntOrValue(std::get<0>(it), std::get<1>(it));
371           }))
372     return tensor::createPadHighOp(type, source, pad, nofold, loc, b);
373 
374   // Exit if the padding values do not match.
375   Attribute padOpPadAttr, padAttr;
376   Value padOpPad = padOp.getConstantPaddingValue();
377   if (!padOpPad || !matchPattern(padOpPad, m_Constant(&padOpPadAttr)) ||
378       !matchPattern(pad, m_Constant(&padAttr)) || padOpPadAttr != padAttr)
379     return tensor::createPadHighOp(type, source, pad, nofold, loc, b);
380 
381   // Return the padded result if the padding values and sizes match.
382   return sliceOp.source();
383 }
384 
385 GenericOp makeTransposeOp(OpBuilder &b, Location loc, Value inputTensor,
386                           Value outputTensor,
387                           ArrayRef<int64_t> transposeVector) {
388   auto resultTensorType = outputTensor.getType().cast<RankedTensorType>();
389   Type elementType = resultTensorType.getElementType();
390 
391   assert(isPermutation(transposeVector) &&
392          "expect transpose vector to be a permutation");
393   assert(transposeVector.size() ==
394              static_cast<size_t>(resultTensorType.getRank()) &&
395          "expect transpose vector size to match result tensor rank");
396 
397   // Compute the transpose and the indentity indexing maps.
398   SmallVector<AffineMap> indexingMaps = {
399       inversePermutation(AffineMap::getPermutationMap(
400           SmallVector<unsigned>(transposeVector.begin(), transposeVector.end()),
401           b.getContext())),
402       AffineMap::getMultiDimIdentityMap(transposeVector.size(),
403                                         b.getContext())};
404   SmallVector<llvm::StringRef> iteratorTypes(transposeVector.size(),
405                                              getParallelIteratorTypeName());
406 
407   // Create a GenericOp to transpose `inputTensor` into `outputTensor`.
408   auto transposeOp = b.create<GenericOp>(
409       loc, resultTensorType, inputTensor, outputTensor,
410       b.getAffineMapArrayAttr(indexingMaps), b.getStrArrayAttr(iteratorTypes),
411       /*doc=*/nullptr,
412       /*library_call=*/nullptr);
413   Region &body = transposeOp.getRegion();
414   body.push_back(new Block());
415   body.front().addArguments({elementType, elementType}, {loc, loc});
416 
417   // Create the body of the transpose operation.
418   OpBuilder::InsertionGuard g(b);
419   b.setInsertionPointToEnd(&body.front());
420   b.create<YieldOp>(loc, transposeOp.getRegion().front().getArgument(0));
421   return transposeOp;
422 }
423 
424 GenericOp makeMemRefCopyOp(OpBuilder &b, Location loc, Value from, Value to) {
425   auto memrefTypeTo = to.getType().cast<MemRefType>();
426 #ifndef NDEBUG
427   auto memrefTypeFrom = from.getType().cast<MemRefType>();
428   assert(memrefTypeFrom.getRank() == memrefTypeTo.getRank() &&
429          "`from` and `to` memref must have the same rank");
430 #endif // NDEBUG
431 
432   AffineMap id =
433       AffineMap::getMultiDimIdentityMap(memrefTypeTo.getRank(), b.getContext());
434   SmallVector<StringRef> iteratorTypes(memrefTypeTo.getRank(),
435                                        getParallelIteratorTypeName());
436   return b.create<linalg::GenericOp>(
437       loc,
438       /*inputs=*/from,
439       /*outputs=*/to,
440       /*indexingMaps=*/llvm::makeArrayRef({id, id}),
441       /*iteratorTypes=*/iteratorTypes,
442       [](OpBuilder &b, Location loc, ValueRange args) {
443         b.create<linalg::YieldOp>(loc, args.front());
444       });
445 }
446 
447 /// Specialization to build an scf "for" nest.
448 template <>
449 void GenerateLoopNest<scf::ForOp>::doit(
450     OpBuilder &b, Location loc, ArrayRef<Range> loopRanges, LinalgOp linalgOp,
451     ArrayRef<Attribute> iteratorTypes,
452     function_ref<scf::ValueVector(OpBuilder &, Location, ValueRange,
453                                   ValueRange)>
454         bodyBuilderFn,
455     Optional<LinalgLoopDistributionOptions> distributionOptions,
456     ArrayRef<StringRef> distributionTypes) {
457   SmallVector<Value> iterArgInitValues = linalgOp.getOutputTensorOperands();
458   // Create procInfo so it dominates loops, if appropriate.
459   SmallVector<ProcInfo, 4> procInfo;
460   SmallVector<DistributionMethod, 0> distributionMethod;
461   if (distributionOptions.hasValue()) {
462     // Collect loop ranges for parallel dimensions.
463     SmallVector<Range, 2> parallelLoopRanges;
464     for (const auto &iteratorType : enumerate(iteratorTypes))
465       if (isParallelIterator(iteratorType.value()))
466         parallelLoopRanges.push_back(loopRanges[iteratorType.index()]);
467 
468     // Get their distribution schemes.
469     distributionMethod = distributionOptions->distributionMethod;
470     if (distributionMethod.size() < parallelLoopRanges.size())
471       parallelLoopRanges.resize(distributionMethod.size());
472     procInfo = distributionOptions->procInfo(b, loc, parallelLoopRanges);
473   }
474 
475   SmallVector<Value, 4> lbs, ubs, steps;
476   unpackRanges(loopRanges, lbs, ubs, steps);
477   LoopNest loopNest = mlir::scf::buildLoopNest(
478       b, loc, lbs, ubs, steps, iterArgInitValues,
479       [&](OpBuilder &b, Location loc, ValueRange ivs, ValueRange iterArgs) {
480         assert(iterArgs.size() == linalgOp.getOutputTensorOperands().size() &&
481                "expect the number of output tensors and iter args to match");
482         SmallVector<Value> operandValuesToUse =
483             linalgOp.getInputAndOutputOperands();
484         if (!iterArgs.empty()) {
485           operandValuesToUse = linalgOp.getInputOperands();
486           operandValuesToUse.append(iterArgs.begin(), iterArgs.end());
487         }
488         return bodyBuilderFn(b, loc, ivs, operandValuesToUse);
489       });
490 
491   if (!distributionOptions || loopNest.loops.empty())
492     return;
493 
494   // Filter out scf.for loops that were created out of parallel dimensions.
495   SmallVector<scf::ForOp, 4> loops;
496   for (const auto &iteratorType : enumerate(iteratorTypes))
497     if (isParallelIterator(iteratorType.value()))
498       loops.push_back(loopNest.loops[iteratorType.index()]);
499 
500   // Distribute - only supports cyclic distribution for now.
501   for (auto it : llvm::zip(loops, procInfo, distributionMethod))
502     if (std::get<2>(it) == DistributionMethod::Cyclic)
503       mapLoopToProcessorIds(std::get<0>(it), std::get<1>(it).procId,
504                             std::get<1>(it).nprocs);
505 }
506 
507 /// Specialization to build affine "for" nest.
508 template <>
509 void GenerateLoopNest<AffineForOp>::doit(
510     OpBuilder &b, Location loc, ArrayRef<Range> loopRanges, LinalgOp linalgOp,
511     ArrayRef<Attribute> iteratorTypes,
512     function_ref<scf::ValueVector(OpBuilder &, Location, ValueRange,
513                                   ValueRange)>
514         bodyBuilderFn,
515     Optional<LinalgLoopDistributionOptions>, ArrayRef<StringRef>) {
516   SmallVector<Value> iterArgInitValues = linalgOp.getOutputTensorOperands();
517   assert(iterArgInitValues.empty() && "unexpected AffineForOp init values");
518   SmallVector<Value, 4> lbs, ubs, steps;
519   unpackRanges(loopRanges, lbs, ubs, steps);
520 
521   // Affine loops require constant steps.
522   SmallVector<int64_t, 4> constantSteps;
523   constantSteps.reserve(steps.size());
524   for (Value v : steps) {
525     auto op = v.getDefiningOp<arith::ConstantIndexOp>();
526     assert(op && "Affine loops require constant steps");
527     constantSteps.push_back(op.value());
528   }
529 
530   mlir::buildAffineLoopNest(b, loc, lbs, ubs, constantSteps,
531                             [&](OpBuilder &b, Location loc, ValueRange ivs) {
532                               SmallVector<Value> operandValuesToUse =
533                                   linalgOp.getInputAndOutputOperands();
534                               bodyBuilderFn(b, loc, ivs, operandValuesToUse);
535                             });
536 }
537 
538 /// Update the `lb`, `ub` and `step` to get per processor `lb`, `ub` and `step`.
539 void updateBoundsForCyclicDistribution(OpBuilder &b, Location loc, Value procId,
540                                        Value nprocs, Value &lb, Value &ub,
541                                        Value &step) {
542   AffineExpr d0, d1;
543   bindDims(b.getContext(), d0, d1);
544   AffineExpr s0 = getAffineSymbolExpr(0, b.getContext());
545   lb = makeComposedAffineApply(b, loc, d0 + d1 * s0, {lb, procId, step});
546   step = makeComposedAffineApply(b, loc, d0 * s0, {nprocs, step});
547 }
548 
549 /// Generates a loop nest consisting of scf.parallel and scf.for, depending
550 /// on the `iteratorTypes.` Consecutive parallel loops create a single
551 /// scf.parallel operation; each sequential loop creates a new scf.for
552 /// operation. The body of the innermost loop is populated by
553 /// `bodyBuilderFn` that accepts a range of induction variables for all
554 /// loops. `ivStorage` is used to store the partial list of induction
555 /// variables.
556 // TODO: this function can be made iterative instead. However, it
557 // will have at most as many recursive calls as nested loops, which rarely
558 // exceeds 10.
559 static void generateParallelLoopNest(
560     OpBuilder &b, Location loc, ValueRange lbs, ValueRange ubs,
561     ValueRange steps, ArrayRef<Attribute> iteratorTypes,
562     function_ref<void(OpBuilder &, Location, ValueRange)> bodyBuilderFn,
563     SmallVectorImpl<Value> &ivStorage,
564     ArrayRef<DistributionMethod> distributionMethod = {}) {
565   assert(lbs.size() == ubs.size());
566   assert(lbs.size() == steps.size());
567   assert(lbs.size() == iteratorTypes.size());
568 
569   // If there are no (more) loops to be generated, generate the body and be
570   // done with it.
571   if (iteratorTypes.empty()) {
572     bodyBuilderFn(b, loc, ivStorage);
573     return;
574   }
575 
576   // Find the outermost parallel loops and drop their types from the list.
577   unsigned nLoops = iteratorTypes.size();
578   unsigned nOuterPar =
579       nLoops - iteratorTypes.drop_while(isParallelIterator).size();
580 
581   // If there are no outer parallel loops, generate one sequential loop and
582   // recurse. Note that we wouldn't have dropped anything from `iteratorTypes`
583   // in this case.
584   if (nOuterPar == 0) {
585     LoopNest singleLoop = buildLoopNest(
586         b, loc, lbs.take_front(), ubs.take_front(), steps.take_front(),
587         [&](OpBuilder &b, Location loc, ValueRange ivs) {
588           ivStorage.append(ivs.begin(), ivs.end());
589           generateParallelLoopNest(b, loc, lbs.drop_front(), ubs.drop_front(),
590                                    steps.drop_front(),
591                                    iteratorTypes.drop_front(), bodyBuilderFn,
592                                    ivStorage, distributionMethod);
593         });
594     return;
595   }
596   if (distributionMethod.empty()) {
597     // Generate a single parallel loop-nest operation for all outermost
598     // parallel loops and recurse.
599     b.create<scf::ParallelOp>(
600         loc, lbs.take_front(nOuterPar), ubs.take_front(nOuterPar),
601         steps.take_front(nOuterPar),
602         [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange localIvs) {
603           ivStorage.append(localIvs.begin(), localIvs.end());
604           generateParallelLoopNest(
605               nestedBuilder, nestedLoc, lbs.drop_front(nOuterPar),
606               ubs.drop_front(nOuterPar), steps.drop_front(nOuterPar),
607               iteratorTypes.drop_front(nOuterPar), bodyBuilderFn, ivStorage,
608               (distributionMethod.size() < nOuterPar)
609                   ? ArrayRef<DistributionMethod>()
610                   : distributionMethod.drop_front(nOuterPar));
611         });
612     return;
613   }
614 
615   // Process all consecutive similarly distributed loops simultaneously.
616   DistributionMethod methodToUse = distributionMethod[0];
617   unsigned numProcessed = 1;
618   for (unsigned i = 1; i < nOuterPar && i < distributionMethod.size(); ++i) {
619     if (distributionMethod[i] != methodToUse)
620       break;
621     numProcessed++;
622   }
623 
624   switch (methodToUse) {
625   case DistributionMethod::Cyclic: {
626     // Generate a single parallel loop-nest operation for all outermost
627     // parallel loops and recurse.
628     b.create<scf::ParallelOp>(
629         loc, lbs.take_front(numProcessed), ubs.take_front(numProcessed),
630         steps.take_front(numProcessed),
631         [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange localIvs) {
632           ivStorage.append(localIvs.begin(), localIvs.end());
633           generateParallelLoopNest(
634               nestedBuilder, nestedLoc, lbs.drop_front(numProcessed),
635               ubs.drop_front(numProcessed), steps.drop_front(numProcessed),
636               iteratorTypes.drop_front(numProcessed), bodyBuilderFn, ivStorage,
637               (distributionMethod.size() < numProcessed)
638                   ? ArrayRef<DistributionMethod>()
639                   : distributionMethod.drop_front(numProcessed));
640         });
641     return;
642   }
643   case DistributionMethod::CyclicNumProcsGeNumIters: {
644     // Check (for the processed loops) that the iteration is in-bounds.
645     ArithBuilder ab(b, loc);
646     Value cond = ab.slt(lbs[0], ubs[0]);
647     for (unsigned i = 1; i < numProcessed; ++i)
648       cond = ab._and(cond, ab.slt(lbs[i], ubs[i]));
649     ivStorage.append(lbs.begin(), std::next(lbs.begin(), numProcessed));
650     b.create<scf::IfOp>(loc, cond, [&](OpBuilder &b, Location loc) {
651       generateParallelLoopNest(
652           b, loc, lbs.drop_front(numProcessed), ubs.drop_front(numProcessed),
653           steps.drop_front(numProcessed),
654           iteratorTypes.drop_front(numProcessed), bodyBuilderFn, ivStorage,
655           distributionMethod.drop_front(numProcessed));
656       b.create<scf::YieldOp>(loc, ValueRange{});
657     });
658     return;
659   }
660   case DistributionMethod::CyclicNumProcsEqNumIters:
661     // No check/loops needed here. Set the `%iv` to be the `%lb` and proceed
662     // with inner loop generation.
663     ivStorage.append(lbs.begin(), std::next(lbs.begin(), numProcessed));
664     generateParallelLoopNest(
665         b, loc, lbs.drop_front(numProcessed), ubs.drop_front(numProcessed),
666         steps.drop_front(numProcessed), iteratorTypes.drop_front(numProcessed),
667         bodyBuilderFn, ivStorage, distributionMethod.drop_front(numProcessed));
668     return;
669   }
670 }
671 
672 /// Specialization for generating a mix of parallel and sequential scf loops.
673 template <>
674 void GenerateLoopNest<scf::ParallelOp>::doit(
675     OpBuilder &b, Location loc, ArrayRef<Range> loopRanges, LinalgOp linalgOp,
676     ArrayRef<Attribute> iteratorTypes,
677     function_ref<scf::ValueVector(OpBuilder &, Location, ValueRange,
678                                   ValueRange)>
679         bodyBuilderFn,
680     Optional<LinalgLoopDistributionOptions> distributionOptions,
681     ArrayRef<StringRef> distributionTypes) {
682   SmallVector<Value> iterArgInitValues = linalgOp.getOutputTensorOperands();
683   assert(iterArgInitValues.empty() && "unexpected ParallelOp init values");
684   // This function may be passed more iterator types than ranges.
685   assert(iteratorTypes.size() >= loopRanges.size() &&
686          "expected iterator type for all ranges");
687   iteratorTypes = iteratorTypes.take_front(loopRanges.size());
688   SmallVector<Value, 8> lbsStorage, ubsStorage, stepsStorage, ivs;
689   unsigned numLoops = iteratorTypes.size();
690   ivs.reserve(numLoops);
691   lbsStorage.reserve(numLoops);
692   ubsStorage.reserve(numLoops);
693   stepsStorage.reserve(numLoops);
694 
695   // Get the loop lb, ub, and step.
696   unpackRanges(loopRanges, lbsStorage, ubsStorage, stepsStorage);
697 
698   // Modify the lb, ub, and step based on the distribution options.
699   SmallVector<DistributionMethod, 0> distributionMethod;
700   if (distributionOptions) {
701     auto &options = distributionOptions.getValue();
702     distributionMethod.assign(distributionOptions->distributionMethod.begin(),
703                               distributionOptions->distributionMethod.end());
704     SmallVector<Range, 2> parallelLoopRanges;
705     for (const auto &iteratorType : enumerate(iteratorTypes)) {
706       if (isParallelIterator(iteratorType.value()))
707         parallelLoopRanges.push_back(loopRanges[iteratorType.index()]);
708     }
709     if (distributionMethod.size() < parallelLoopRanges.size())
710       parallelLoopRanges.resize(distributionMethod.size());
711     SmallVector<ProcInfo, 2> procInfo =
712         options.procInfo(b, loc, parallelLoopRanges);
713     unsigned index = 0;
714     for (const auto &iteratorType : enumerate(iteratorTypes)) {
715       if (index >= procInfo.size())
716         break;
717       if (isParallelIterator(iteratorType.value())) {
718         unsigned i = iteratorType.index();
719         updateBoundsForCyclicDistribution(b, loc, procInfo[index].procId,
720                                           procInfo[index].nprocs, lbsStorage[i],
721                                           ubsStorage[i], stepsStorage[i]);
722         index++;
723       }
724     }
725   }
726   ValueRange lbs(lbsStorage), ubs(ubsStorage), steps(stepsStorage);
727   generateParallelLoopNest(
728       b, loc, lbs, ubs, steps, iteratorTypes,
729       [&](OpBuilder &b, Location loc, ValueRange ivs) {
730         SmallVector<Value> operandValuesToUse =
731             linalgOp.getInputAndOutputOperands();
732         bodyBuilderFn(b, loc, ivs, operandValuesToUse);
733       },
734       ivs, distributionMethod);
735 
736   assert(ivs.size() == iteratorTypes.size() && "did not generate enough loops");
737 }
738 
739 static Value fullyComposeAndAffineApply(OpBuilder &b, Location loc,
740                                         AffineExpr expr, ValueRange operands) {
741   AffineMap map = AffineMap::inferFromExprList({expr}).front();
742   SmallVector<Value> normalizedOperands(operands.begin(), operands.end());
743   mlir::fullyComposeAffineMapAndOperands(&map, &normalizedOperands);
744   canonicalizeMapAndOperands(&map, &normalizedOperands);
745   return b.createOrFold<AffineApplyOp>(loc, map, normalizedOperands);
746 }
747 
748 Value makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile,
749                      ValueRange tileSizes, AffineMap map, ValueRange lbs,
750                      ValueRange ubs, ValueRange subShapeSizes) {
751   auto shapedType = valueToTile.getType().dyn_cast<ShapedType>();
752   assert(shapedType && "only shaped types can be tiled");
753   ArrayRef<int64_t> shape = shapedType.getShape();
754   int64_t rank = shapedType.getRank();
755 
756   // Construct a new subview / extract_slice for the tile.
757   SmallVector<OpFoldResult, 4> offsets, sizes, strides;
758   offsets.reserve(rank);
759   sizes.reserve(rank);
760   strides.reserve(rank);
761   for (unsigned r = 0; r < rank; ++r) {
762     LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: for dim#" << r);
763     if (!isTiled(map.getSubMap({r}), tileSizes)) {
764       offsets.push_back(builder.getIndexAttr(0));
765       Value dim = createOrFoldDimOp(builder, loc, valueToTile, r);
766       sizes.push_back(getAsOpFoldResult(dim));
767       strides.push_back(builder.getIndexAttr(1));
768       LLVM_DEBUG(llvm::dbgs() << ": not tiled: use size: " << dim << "\n");
769       continue;
770     }
771     LLVM_DEBUG(llvm::dbgs() << ": tiled: figure out subsize...\n");
772 
773     // Tiling creates a new slice at the proper index, the slice step is 1
774     // (i.e. the op does not subsample, stepping occurs in the loop).
775     auto m = map.getSubMap({r});
776     LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: submap: " << m << "\n");
777     auto offset = applyMapToValues(builder, loc, m, lbs).front();
778     offsets.push_back(offset);
779     auto closedIntSize =
780         applyMapToValues(builder, loc, m, subShapeSizes).front();
781     // Resulting size needs to be made half open interval again.
782     AffineExpr s0 = getAffineSymbolExpr(0, builder.getContext());
783     Value size =
784         fullyComposeAndAffineApply(builder, loc, s0 + 1, closedIntSize);
785     LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: raw size: " << size << "\n");
786 
787     // The size of the subview / extract_slice should be trimmed to avoid
788     // out-of-bounds accesses, unless:
789     // a. We statically know the subshape size divides the shape size evenly.
790     // b. The subshape size is 1. According to the way the loops are set up,
791     //    tensors with "0" dimensions would never be constructed.
792     int64_t shapeSize = shape[r];
793     auto sizeCst = size.getDefiningOp<arith::ConstantIndexOp>();
794     auto hasTileSizeOne = sizeCst && sizeCst.value() == 1;
795     auto dividesEvenly = sizeCst && !ShapedType::isDynamic(shapeSize) &&
796                          ((shapeSize % sizeCst.value()) == 0);
797     if (!hasTileSizeOne && !dividesEvenly) {
798       LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: shapeSize=" << shapeSize
799                               << ", size: " << size
800                               << ": make sure in bound with affine.min\n");
801 
802       AffineExpr dim0, dim1, dim2;
803       bindDims(builder.getContext(), dim0, dim1, dim2);
804 
805       // Get the dimension size for this dimension. We need to first calculate
806       // the max index and then plus one. This is important because for
807       // convolution ops, we have its input window dimension's affine map of the
808       // form `(d0 * s0 + d1)`, where `d0`/`d1 is an output/filter window
809       // dimension and `s0` is stride. Directly use the dimension size of
810       // output/filer window dimensions will cause incorrect calculation.
811       AffineMap minusOneMap =
812           AffineMap::inferFromExprList({ArrayRef<AffineExpr>{dim0 - 1}})
813               .front();
814       AffineMap plusOneMap =
815           AffineMap::inferFromExprList({ArrayRef<AffineExpr>{dim0 + 1}})
816               .front();
817       auto maxIndices = llvm::to_vector<8>(llvm::map_range(ubs, [&](Value ub) {
818         return makeComposedAffineApply(builder, loc, minusOneMap, {ub})
819             .getResult();
820       }));
821       Value maxIndex = applyMapToValues(builder, loc, m, maxIndices).front();
822       Value d = makeComposedAffineApply(builder, loc, plusOneMap, {maxIndex});
823 
824       // Compute min(size, dim - offset) to avoid out-of-bounds accesses.
825       AffineMap minMap = AffineMap::inferFromExprList(
826                              {ArrayRef<AffineExpr>{dim0, dim1 - dim2}})
827                              .front();
828       SmallVector<Value, 4> operands{size, d, offset};
829       fullyComposeAffineMapAndOperands(&minMap, &operands);
830       canonicalizeMapAndOperands(&minMap, &operands);
831       size = builder.create<AffineMinOp>(loc, builder.getIndexType(), minMap,
832                                          operands);
833     }
834 
835     sizes.push_back(size);
836     LLVM_DEBUG(llvm::dbgs()
837                << "makeTiledShape: new offset: " << offset << "\n");
838     LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: new size: " << size << "\n");
839     strides.push_back(builder.getIndexAttr(1));
840   }
841 
842   auto *sliceOp = TypeSwitch<ShapedType, Operation *>(shapedType)
843                       .Case([&](MemRefType) {
844                         return builder.create<memref::SubViewOp>(
845                             loc, valueToTile, offsets, sizes, strides);
846                       })
847                       .Case([&](RankedTensorType) {
848                         return makeComposedExtractSliceOp(
849                             builder, loc, valueToTile, offsets, sizes, strides);
850                       })
851                       .Default([](ShapedType) -> Operation * {
852                         llvm_unreachable("Unexpected shaped type");
853                       });
854   return sliceOp->getResult(0);
855 }
856 
857 SmallVector<Value> computeTileOffsets(OpBuilder &b, Location loc,
858                                       ValueRange ivs, ValueRange tileSizes) {
859   SmallVector<Value> offsets;
860   for (unsigned idx = 0, idxIvs = 0, e = tileSizes.size(); idx < e; ++idx) {
861     LLVM_DEBUG(llvm::dbgs() << "makeTiledShapes: for loop#" << idx << "\n");
862     bool isTiled = !isZero(tileSizes[idx]);
863     offsets.push_back(
864         isTiled ? ivs[idxIvs++]
865                 : b.create<arith::ConstantIndexOp>(loc, 0).getResult());
866     LLVM_DEBUG(llvm::dbgs()
867                << "computeTileOffsets: " << offsets.back() << "\n");
868   }
869   return offsets;
870 }
871 
872 SmallVector<Value> computeTileSizes(OpBuilder &b, Location loc, ValueRange ivs,
873                                     ValueRange tileSizes,
874                                     ArrayRef<Value> sizeBounds) {
875   SmallVector<Value> sizes;
876   for (unsigned idx = 0, e = tileSizes.size(); idx < e; ++idx) {
877     bool isTiled = !isZero(tileSizes[idx]);
878     // Before composing, we need to make range a closed interval.
879     Value size = isTiled ? tileSizes[idx] : sizeBounds[idx];
880     AffineExpr d0 = getAffineDimExpr(0, b.getContext());
881     sizes.push_back(fullyComposeAndAffineApply(b, loc, d0 - 1, size));
882     LLVM_DEBUG(llvm::dbgs() << "computeTileSizes: " << sizes.back() << "\n");
883   }
884   return sizes;
885 }
886 
887 SmallVector<Value, 4> makeTiledShapes(OpBuilder &b, Location loc,
888                                       LinalgOp linalgOp,
889                                       ArrayRef<Value> valuesToTile,
890                                       ValueRange ivs, ValueRange tileSizes,
891                                       ArrayRef<Value> sizeBounds) {
892   assert(ivs.size() == static_cast<size_t>(llvm::count_if(
893                            llvm::make_range(tileSizes.begin(), tileSizes.end()),
894                            [](Value v) { return !isZero(v); })) &&
895          "expected as many ivs as non-zero sizes");
896 
897   // Construct (potentially temporary) mins and maxes on which to apply maps
898   // that define tile subshapes.
899   SmallVector<Value> lbs = computeTileOffsets(b, loc, ivs, tileSizes);
900   SmallVector<Value> subShapeSizes =
901       computeTileSizes(b, loc, ivs, tileSizes, sizeBounds);
902 
903   assert(static_cast<int64_t>(valuesToTile.size()) ==
904              linalgOp.getNumInputsAndOutputs() &&
905          "expected one value to tile for every operand");
906   SmallVector<Value, 4> tiledShapes;
907   tiledShapes.reserve(valuesToTile.size());
908   for (OpOperand *opOperand : linalgOp.getInputAndOutputOperands()) {
909     Value shapedOp = valuesToTile[opOperand->getOperandNumber()];
910     LLVM_DEBUG(llvm::dbgs() << "makeTiledShapes: for operand " << shapedOp);
911     AffineMap map = linalgOp.getTiedIndexingMap(opOperand);
912     // Use `opOperand` as is if it is not tiled and not an output tensor. Having
913     // an extract/insert slice pair for all output tensors simplifies follow up
914     // transformations such as padding and bufferization since the
915     // extract/insert slice pairs make the accessed iteration argument
916     // subdomains explicit.
917     if (!isTiled(map, tileSizes) && !linalgOp.isOutputTensor(opOperand)) {
918       tiledShapes.push_back(shapedOp);
919       LLVM_DEBUG(llvm::dbgs() << ": not tiled: use shape: "
920                               << opOperand->get().getType() << "\n");
921       continue;
922     }
923     LLVM_DEBUG(llvm::dbgs() << ": tiled: figure out subshape...\n");
924 
925     tiledShapes.push_back(makeTiledShape(b, loc, shapedOp, tileSizes, map, lbs,
926                                          sizeBounds, subShapeSizes));
927   }
928 
929   return tiledShapes;
930 }
931 
932 void addTileLoopIvsToIndexOpResults(OpBuilder &b, LinalgOp tiledOp,
933                                     ArrayRef<Value> ivs) {
934   if (tiledOp.hasIndexSemantics()) {
935     for (IndexOp indexOp : tiledOp.getBlock()->getOps<IndexOp>()) {
936       if (ivs[indexOp.dim()] == nullptr)
937         continue;
938       OpBuilder::InsertionGuard guard(b);
939       b.setInsertionPointAfter(indexOp);
940       AffineExpr index, offset;
941       bindDims(b.getContext(), index, offset);
942       AffineApplyOp applyOp = makeComposedAffineApply(
943           b, indexOp.getLoc(), index + offset,
944           ValueRange{indexOp.getResult(), ivs[indexOp.dim()]});
945       indexOp.getResult().replaceAllUsesExcept(applyOp, applyOp);
946     }
947   }
948 }
949 
950 } // namespace linalg
951 } // namespace mlir
952