1 //===- Utils.cpp - Utilities to support the Linalg dialect ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements utilities for the Linalg dialect.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "mlir/Dialect/Linalg/Utils/Utils.h"
14 
15 #include "mlir/Dialect/Affine/EDSC/Intrinsics.h"
16 #include "mlir/Dialect/Affine/IR/AffineOps.h"
17 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
18 #include "mlir/Dialect/Linalg/IR/LinalgTypes.h"
19 #include "mlir/Dialect/SCF/EDSC/Builders.h"
20 #include "mlir/Dialect/SCF/SCF.h"
21 #include "mlir/Dialect/StandardOps/IR/Ops.h"
22 #include "mlir/IR/AffineExpr.h"
23 #include "mlir/IR/AffineMap.h"
24 #include "mlir/IR/Matchers.h"
25 #include "mlir/IR/OpImplementation.h"
26 #include "mlir/Pass/Pass.h"
27 #include "mlir/Transforms/LoopUtils.h"
28 
29 using namespace mlir;
30 using namespace mlir::linalg;
31 using namespace mlir::scf;
32 
33 Optional<RegionMatcher::BinaryOpKind>
34 RegionMatcher::matchAsScalarBinaryOp(GenericOp op) {
35   auto &region = op.region();
36   if (!llvm::hasSingleElement(region))
37     return llvm::None;
38 
39   Block &block = region.front();
40   if (block.getNumArguments() != 2 ||
41       !block.getArgument(0).getType().isSignlessIntOrFloat() ||
42       !block.getArgument(1).getType().isSignlessIntOrFloat())
43     return llvm::None;
44 
45   auto &ops = block.getOperations();
46   if (!llvm::hasSingleElement(block.without_terminator()))
47     return llvm::None;
48 
49   using mlir::matchers::m_Val;
50   auto a = m_Val(block.getArgument(0));
51   auto b = m_Val(block.getArgument(1));
52 
53   auto addPattern = m_Op<linalg::YieldOp>(m_Op<AddIOp>(a, b));
54   if (addPattern.match(&ops.back()))
55     return BinaryOpKind::IAdd;
56 
57   return llvm::None;
58 }
59 
60 bool mlir::linalg::isParallelIteratorType(Attribute attr) {
61   if (auto strAttr = attr.dyn_cast<StringAttr>()) {
62     return strAttr.getValue() == getParallelIteratorTypeName();
63   }
64   return false;
65 }
66 
67 bool mlir::linalg::isReductionIteratorType(Attribute attr) {
68   if (auto strAttr = attr.dyn_cast<StringAttr>()) {
69     return strAttr.getValue() == getReductionIteratorTypeName();
70   }
71   return false;
72 }
73 
74 bool mlir::linalg::isWindowIteratorType(Attribute attr) {
75   if (auto strAttr = attr.dyn_cast<StringAttr>()) {
76     return strAttr.getValue() == getWindowIteratorTypeName();
77   }
78   return false;
79 }
80 
/// Explicit instantiation of loop nest generator for different loop types.
/// The per-loop-type specializations of `doit` are defined below in this file.
template struct mlir::linalg::GenerateLoopNest<scf::ForOp>;
template struct mlir::linalg::GenerateLoopNest<scf::ParallelOp>;
template struct mlir::linalg::GenerateLoopNest<AffineForOp>;
85 
86 /// Given a list of subview ranges, extract individual values for lower, upper
87 /// bounds and steps and put them into the corresponding vectors.
88 static void unpackRanges(ArrayRef<Range> ranges, SmallVectorImpl<Value> &lbs,
89                          SmallVectorImpl<Value> &ubs,
90                          SmallVectorImpl<Value> &steps) {
91   for (Range range : ranges) {
92     lbs.emplace_back(range.offset);
93     ubs.emplace_back(range.size);
94     steps.emplace_back(range.stride);
95   }
96 }
97 
98 namespace mlir {
99 namespace linalg {
100 
101 SmallVector<int64_t, 8> getStaticShape(LinalgOp linalgOp) {
102   SmallVector<int64_t, 8> res;
103   for (Value v : linalgOp.getShapedOperands()) {
104     auto shape = v.getType().cast<ShapedType>().getShape();
105     res.append(shape.begin(), shape.end());
106   }
107   return res;
108 }
109 
110 Optional<SmallVector<int64_t, 4>> getStaticLoopRanges(LinalgOp linalgOp) {
111   SmallVector<int64_t, 8> viewSizes = getStaticShape(linalgOp);
112   AffineMap invertedMap = linalgOp.getShapesToLoopsMap();
113   if (!invertedMap)
114     return {};
115   return invertedMap.compose(viewSizes);
116 }
117 
118 /// If `size` comes from an AffineMinOp and one of the values of AffineMinOp
119 /// is a constant then return a new value set to the smallest such constant.
120 /// Otherwise returngetSmallestBoundingIndex nullptr.
121 IntegerAttr getSmallestBoundingIndex(Value size) {
122   Optional<int64_t> boundingConst = {};
123   if (auto affineMinOp = size.getDefiningOp<AffineMinOp>()) {
124     for (auto e : affineMinOp.getAffineMap().getResults())
125       if (auto cst = e.dyn_cast<AffineConstantExpr>())
126         boundingConst = boundingConst
127                             ? std::min(boundingConst.getValue(), cst.getValue())
128                             : cst.getValue();
129   } else if (auto constIndexOp = size.getDefiningOp<ConstantOp>()) {
130     if (constIndexOp.getType().isa<IndexType>())
131       boundingConst = constIndexOp.value().cast<IntegerAttr>().getInt();
132   } else if (auto affineApplyOp = size.getDefiningOp<AffineApplyOp>()) {
133     if (auto cExpr = affineApplyOp.getAffineMap()
134                          .getResult(0)
135                          .dyn_cast<AffineConstantExpr>())
136       boundingConst = cExpr.getValue();
137   }
138   if (boundingConst && *boundingConst >= 0)
139     return Builder(size.getContext()).getIndexAttr(*boundingConst);
140   return nullptr;
141 }
142 
/// Specialization to build an scf "for" nest.
template <>
void GenerateLoopNest<scf::ForOp>::doit(
    ArrayRef<Range> loopRanges, ValueRange iterArgInitValues,
    ArrayRef<Attribute> iteratorTypes,
    function_ref<scf::ValueVector(ValueRange, ValueRange)> bodyBuilderFn,
    Optional<LinalgLoopDistributionOptions> distributionOptions) {
  // Create procInfo so it dominates loops, if appropriate.
  // NOTE: `iteratorTypes` is not consulted here; every range becomes a
  // sequential scf.for loop.
  OpBuilder &builder = edsc::ScopedContext::getBuilderRef();
  Location loc = edsc::ScopedContext::getLocation();
  SmallVector<ProcInfo, 2> procInfo;
  if (distributionOptions.hasValue())
    procInfo = distributionOptions->procInfo(builder, loc, loopRanges);

  // Build the scf.for nest from the unpacked range bounds/steps.
  SmallVector<Value, 4> lbs, ubs, steps;
  unpackRanges(loopRanges, lbs, ubs, steps);
  LoopNest loopNest =
      edsc::loopNestBuilder(lbs, ubs, steps, iterArgInitValues, bodyBuilderFn);

  if (!distributionOptions.hasValue() || loopNest.loops.empty())
    return;

  // Only supports cyclic distribution for now.
  // llvm::zip stops at the shortest range, so loops beyond the provided
  // procInfo/distributionMethod entries are left undistributed.
  for (auto it : llvm::zip(loopNest.loops, procInfo,
                           distributionOptions->distributionMethod))
    if (std::get<2>(it) == DistributionMethod::Cyclic)
      mapLoopToProcessorIds(std::get<0>(it), std::get<1>(it).procId,
                            std::get<1>(it).nprocs);
}
172 
/// Specialization to build affine "for" nest. Distribution options are
/// accepted for interface uniformity but ignored; affine nests are not
/// distributed here.
template <>
void GenerateLoopNest<AffineForOp>::doit(
    ArrayRef<Range> loopRanges, ValueRange iterArgInitValues,
    ArrayRef<Attribute> iteratorTypes,
    function_ref<scf::ValueVector(ValueRange, ValueRange)> bodyBuilderFn,
    Optional<LinalgLoopDistributionOptions>) {
  assert(iterArgInitValues.empty() && "unexpected AffineForOp init values");
  SmallVector<Value, 4> lbs, ubs, steps;
  unpackRanges(loopRanges, lbs, ubs, steps);

  // Affine loops require constant steps.
  SmallVector<int64_t, 4> constantSteps;
  constantSteps.reserve(steps.size());
  for (Value v : steps) {
    auto op = v.getDefiningOp<ConstantIndexOp>();
    assert(op && "Affine loops require constant steps");
    constantSteps.push_back(op.getValue());
  }

  // Adapt the iter-args-aware body builder to the affine builder, which
  // supplies only induction variables (no loop-carried values).
  auto bodyBuilderWithoutIterArgsFn = [&](ValueRange ivs) {
    bodyBuilderFn(ivs, {});
  };
  edsc::affineLoopNestBuilder(lbs, ubs, constantSteps,
                              bodyBuilderWithoutIterArgsFn);
}
199 
/// Update the `lb`, `ub` and `step` to get per processor `lb`, `ub` and `step`.
/// Cyclic distribution: processor `procId` starts at `lb + procId * step` and
/// advances by `nprocs * step`; the upper bound is left unchanged (and the
/// `builder`/`loc`/`ub` parameters are unused by this scheme — the EDSC
/// operators pick up the scoped builder implicitly).
static void updateBoundsForCyclicDistribution(OpBuilder &builder, Location loc,
                                              Value procId, Value nprocs,
                                              Value &lb, Value &ub,
                                              Value &step) {
  using edsc::op::operator+;
  using edsc::op::operator*;
  lb = lb + (procId * step);
  step = nprocs * step;
}
210 
/// Generates a loop nest consisting of scf.parallel and scf.for, depending
/// on the `iteratorTypes.` Consecutive parallel loops create a single
/// scf.parallel operation; each sequential loop creates a new scf.for
/// operation. The body of the innermost loop is populated by
/// `bodyBuilderFn` that accepts a range of induction variables for all
/// loops. `ivStorage` is used to store the partial list of induction
/// variables.
// TODO: this function can be made iterative instead. However, it
// will have at most as many recursive calls as nested loops, which rarely
// exceeds 10.
static void
generateParallelLoopNest(ValueRange lbs, ValueRange ubs, ValueRange steps,
                         ArrayRef<Attribute> iteratorTypes,
                         function_ref<void(ValueRange)> bodyBuilderFn,
                         SmallVectorImpl<Value> &ivStorage,
                         ArrayRef<DistributionMethod> distributionMethod = {}) {
  assert(lbs.size() == ubs.size());
  assert(lbs.size() == steps.size());
  assert(lbs.size() == iteratorTypes.size());

  // If there are no (more) loops to be generated, generate the body and be
  // done with it.
  if (iteratorTypes.empty())
    return bodyBuilderFn(ivStorage);

  // Find the outermost parallel loops and drop their types from the list.
  unsigned nLoops = iteratorTypes.size();
  unsigned nOuterPar =
      nLoops - iteratorTypes.drop_while(isParallelIteratorType).size();

  // If there are no outer parallel loops, generate one sequential loop and
  // recurse. Note that we wouldn't have dropped anything from `iteratorTypes`
  // in this case.
  if (nOuterPar == 0) {
    edsc::loopNestBuilder(lbs[0], ubs[0], steps[0], [&](Value iv) {
      ivStorage.push_back(iv);
      generateParallelLoopNest(lbs.drop_front(), ubs.drop_front(),
                               steps.drop_front(), iteratorTypes.drop_front(),
                               bodyBuilderFn, ivStorage, distributionMethod);
    });
    return;
  }
  if (distributionMethod.empty()) {
    // Generate a single parallel loop-nest operation for all outermost
    // parallel loops and recurse.
    edsc::OperationBuilder<scf::ParallelOp>(
        lbs.take_front(nOuterPar), ubs.take_front(nOuterPar),
        steps.take_front(nOuterPar),
        [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange localIvs) {
          edsc::ScopedContext context(nestedBuilder, nestedLoc);
          ivStorage.append(localIvs.begin(), localIvs.end());
          // Drop the distribution entries consumed here; if fewer entries
          // than loops were supplied, stop distributing entirely.
          generateParallelLoopNest(
              lbs.drop_front(nOuterPar), ubs.drop_front(nOuterPar),
              steps.drop_front(nOuterPar), iteratorTypes.drop_front(nOuterPar),
              bodyBuilderFn, ivStorage,
              (distributionMethod.size() < nOuterPar)
                  ? ArrayRef<DistributionMethod>()
                  : distributionMethod.drop_front(nOuterPar));
        });
    return;
  }

  // Process all consecutive similarly distributed loops simultaneously.
  DistributionMethod methodToUse = distributionMethod[0];
  unsigned numProcessed = 1;
  for (unsigned i = 1; i < nOuterPar && i < distributionMethod.size(); ++i) {
    if (distributionMethod[i] != methodToUse)
      break;
    numProcessed++;
  }

  switch (methodToUse) {
  case DistributionMethod::Cyclic: {
    // Generate a single parallel loop-nest operation for all outermost
    // parallel loops and recurse.
    edsc::OperationBuilder<scf::ParallelOp>(
        lbs.take_front(numProcessed), ubs.take_front(numProcessed),
        steps.take_front(numProcessed),
        [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange localIvs) {
          edsc::ScopedContext context(nestedBuilder, nestedLoc);
          ivStorage.append(localIvs.begin(), localIvs.end());
          generateParallelLoopNest(
              lbs.drop_front(numProcessed), ubs.drop_front(numProcessed),
              steps.drop_front(numProcessed),
              iteratorTypes.drop_front(numProcessed), bodyBuilderFn, ivStorage,
              (distributionMethod.size() < numProcessed)
                  ? ArrayRef<DistributionMethod>()
                  : distributionMethod.drop_front(numProcessed));
        });
    return;
  }
  case DistributionMethod::CyclicNumProcsGeNumIters: {
    // Check (for the processed loops) that the iteration is in-bounds.
    // No loop is emitted: each processor handles at most one iteration,
    // guarded by `lb < ub` for every processed dimension.
    using edsc::op::slt;
    using edsc::op::operator&&;
    Value cond = slt(lbs[0], ubs[0]);
    for (unsigned i = 1; i < numProcessed; ++i)
      cond = cond && slt(lbs[i], ubs[i]);
    ivStorage.append(lbs.begin(), std::next(lbs.begin(), numProcessed));
    edsc::conditionBuilder(cond, [&]() {
      generateParallelLoopNest(
          lbs.drop_front(numProcessed), ubs.drop_front(numProcessed),
          steps.drop_front(numProcessed),
          iteratorTypes.drop_front(numProcessed), bodyBuilderFn, ivStorage,
          distributionMethod.drop_front(numProcessed));
    });
    return;
  }
  case DistributionMethod::CyclicNumProcsEqNumIters:
    // No check/loops needed here. Set the `%iv` to be the `%lb` and proceed
    // with inner loop generation.
    ivStorage.append(lbs.begin(), std::next(lbs.begin(), numProcessed));
    generateParallelLoopNest(
        lbs.drop_front(numProcessed), ubs.drop_front(numProcessed),
        steps.drop_front(numProcessed), iteratorTypes.drop_front(numProcessed),
        bodyBuilderFn, ivStorage, distributionMethod.drop_front(numProcessed));
    return;
  }
}
330 
/// Specialization for generating a mix of parallel and sequential scf loops.
template <>
void GenerateLoopNest<scf::ParallelOp>::doit(
    ArrayRef<Range> loopRanges, ValueRange iterArgInitValues,
    ArrayRef<Attribute> iteratorTypes,
    function_ref<scf::ValueVector(ValueRange, ValueRange)> bodyBuilderFn,
    Optional<LinalgLoopDistributionOptions> distributionOptions) {
  assert(iterArgInitValues.empty() && "unexpected ParallelOp init values");
  // This function may be passed more iterator types than ranges.
  assert(iteratorTypes.size() >= loopRanges.size() &&
         "expected iterator type for all ranges");
  iteratorTypes = iteratorTypes.take_front(loopRanges.size());
  SmallVector<Value, 8> lbsStorage, ubsStorage, stepsStorage, ivs;
  unsigned numLoops = iteratorTypes.size();
  ivs.reserve(numLoops);
  lbsStorage.reserve(numLoops);
  ubsStorage.reserve(numLoops);
  stepsStorage.reserve(numLoops);

  // Get the loop lb, ub, and step.
  unpackRanges(loopRanges, lbsStorage, ubsStorage, stepsStorage);

  // Modify the lb, ub, and step based on the distribution options.
  SmallVector<DistributionMethod, 0> distributionMethod;
  if (distributionOptions) {
    auto &options = distributionOptions.getValue();
    OpBuilder &builder = edsc::ScopedContext::getBuilderRef();
    Location loc = edsc::ScopedContext::getLocation();
    distributionMethod.assign(distributionOptions->distributionMethod.begin(),
                              distributionOptions->distributionMethod.end());
    // Collect the ranges of the parallel dimensions only; distribution
    // applies to parallel loops exclusively.
    SmallVector<Range, 2> parallelLoopRanges;
    for (auto iteratorType : enumerate(iteratorTypes)) {
      if (isParallelIteratorType(iteratorType.value()))
        parallelLoopRanges.push_back(loopRanges[iteratorType.index()]);
    }
    // If fewer distribution methods than parallel loops were provided, only
    // the first `distributionMethod.size()` parallel loops are distributed.
    if (distributionMethod.size() < parallelLoopRanges.size())
      parallelLoopRanges.resize(distributionMethod.size());
    SmallVector<ProcInfo, 2> procInfo =
        options.procInfo(builder, loc, parallelLoopRanges);
    // Rewrite each distributed parallel loop's bounds to the per-processor
    // cyclic form; `index` walks procInfo in parallel-loop order.
    unsigned index = 0;
    for (auto iteratorType : enumerate(iteratorTypes)) {
      if (index >= procInfo.size())
        break;
      if (isParallelIteratorType(iteratorType.value())) {
        unsigned i = iteratorType.index();
        updateBoundsForCyclicDistribution(builder, loc, procInfo[index].procId,
                                          procInfo[index].nprocs, lbsStorage[i],
                                          ubsStorage[i], stepsStorage[i]);
        index++;
      }
    }
  }
  ValueRange lbs(lbsStorage), ubs(ubsStorage), steps(stepsStorage);
  // Adapt the iter-args-aware body builder: parallel nests carry no
  // loop-carried values.
  auto bodyBuilderWithoutIterArgsFn = [&](ValueRange ivs) {
    bodyBuilderFn(ivs, {});
  };
  generateParallelLoopNest(lbs, ubs, steps, iteratorTypes,
                           bodyBuilderWithoutIterArgsFn, ivs,
                           distributionMethod);

  assert(ivs.size() == iteratorTypes.size() && "did not generate enough loops");
}
393 
394 } // namespace linalg
395 } // namespace mlir
396