Async/Transforms/AsyncParallelFor.cpp

*c30ab6c2SEugene Zhulenev//===- AsyncParallelFor.cpp - Implementation of Async Parallel For --------===//
*c30ab6c2SEugene Zhulenev//
*c30ab6c2SEugene Zhulenev// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
*c30ab6c2SEugene Zhulenev// See https://llvm.org/LICENSE.txt for license information.
*c30ab6c2SEugene Zhulenev// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*c30ab6c2SEugene Zhulenev//
*c30ab6c2SEugene Zhulenev//===----------------------------------------------------------------------===//
*c30ab6c2SEugene Zhulenev//
// This file implements scf.parallel to scf.for + async.execute conversion pass.
*c30ab6c2SEugene Zhulenev//
*c30ab6c2SEugene Zhulenev//===----------------------------------------------------------------------===//
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev#include "PassDetail.h"
*c30ab6c2SEugene Zhulenev#include "mlir/Dialect/Async/IR/Async.h"
*c30ab6c2SEugene Zhulenev#include "mlir/Dialect/Async/Passes.h"
*c30ab6c2SEugene Zhulenev#include "mlir/Dialect/SCF/SCF.h"
*c30ab6c2SEugene Zhulenev#include "mlir/Dialect/StandardOps/IR/Ops.h"
*c30ab6c2SEugene Zhulenev#include "mlir/IR/BlockAndValueMapping.h"
*c30ab6c2SEugene Zhulenev#include "mlir/IR/PatternMatch.h"
*c30ab6c2SEugene Zhulenev#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenevusing namespace mlir;
*c30ab6c2SEugene Zhulenevusing namespace mlir::async;
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev#define DEBUG_TYPE "async-parallel-for"
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenevnamespace {
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev// Rewrite scf.parallel operation into multiple concurrent async.execute
*c30ab6c2SEugene Zhulenev// operations over non overlapping subranges of the original loop.
*c30ab6c2SEugene Zhulenev//
*c30ab6c2SEugene Zhulenev// Example:
*c30ab6c2SEugene Zhulenev//
//   scf.parallel (%i, %j) = (%lbi, %lbj) to (%ubi, %ubj) step (%si, %sj) {
*c30ab6c2SEugene Zhulenev//     "do_some_compute"(%i, %j): () -> ()
*c30ab6c2SEugene Zhulenev//   }
*c30ab6c2SEugene Zhulenev//
*c30ab6c2SEugene Zhulenev// Converted to:
*c30ab6c2SEugene Zhulenev//
*c30ab6c2SEugene Zhulenev//   %c0 = constant 0 : index
*c30ab6c2SEugene Zhulenev//   %c1 = constant 1 : index
*c30ab6c2SEugene Zhulenev//
*c30ab6c2SEugene Zhulenev//   // Compute blocks sizes for each induction variable.
*c30ab6c2SEugene Zhulenev//   %num_blocks_i = ... : index
*c30ab6c2SEugene Zhulenev//   %num_blocks_j = ... : index
*c30ab6c2SEugene Zhulenev//   %block_size_i = ... : index
*c30ab6c2SEugene Zhulenev//   %block_size_j = ... : index
*c30ab6c2SEugene Zhulenev//
*c30ab6c2SEugene Zhulenev//   // Create an async group to track async execute ops.
*c30ab6c2SEugene Zhulenev//   %group = async.create_group
*c30ab6c2SEugene Zhulenev//
*c30ab6c2SEugene Zhulenev//   scf.for %bi = %c0 to %num_blocks_i step %c1 {
*c30ab6c2SEugene Zhulenev//     %block_start_i = ... : index
*c30ab6c2SEugene Zhulenev//     %block_end_i   = ... : index
*c30ab6c2SEugene Zhulenev//
*c30ab6c2SEugene Zhulenev//     scf.for %bj = %c0 to %num_blocks_j step %c1 {
*c30ab6c2SEugene Zhulenev//       %block_start_j = ... : index
*c30ab6c2SEugene Zhulenev//       %block_end_j   = ... : index
*c30ab6c2SEugene Zhulenev//
*c30ab6c2SEugene Zhulenev//       // Execute the body of original parallel operation for the current
*c30ab6c2SEugene Zhulenev//       // block.
*c30ab6c2SEugene Zhulenev//       %token = async.execute {
*c30ab6c2SEugene Zhulenev//         scf.for %i = %block_start_i to %block_end_i step %si {
*c30ab6c2SEugene Zhulenev//           scf.for %j = %block_start_j to %block_end_j step %sj {
*c30ab6c2SEugene Zhulenev//             "do_some_compute"(%i, %j): () -> ()
*c30ab6c2SEugene Zhulenev//           }
*c30ab6c2SEugene Zhulenev//         }
*c30ab6c2SEugene Zhulenev//       }
*c30ab6c2SEugene Zhulenev//
*c30ab6c2SEugene Zhulenev//       // Add produced async token to the group.
*c30ab6c2SEugene Zhulenev//       async.add_to_group %token, %group
*c30ab6c2SEugene Zhulenev//     }
*c30ab6c2SEugene Zhulenev//   }
*c30ab6c2SEugene Zhulenev//
*c30ab6c2SEugene Zhulenev//   // Await completion of all async.execute operations.
*c30ab6c2SEugene Zhulenev//   async.await_all %group
*c30ab6c2SEugene Zhulenev//
*c30ab6c2SEugene Zhulenev// In this example outer loop launches inner block level loops as separate async
*c30ab6c2SEugene Zhulenev// execute operations which will be executed concurrently.
*c30ab6c2SEugene Zhulenev//
// At the end it waits for the completion of all async execute operations.
*c30ab6c2SEugene Zhulenev//
*c30ab6c2SEugene Zhulenevstruct AsyncParallelForRewrite : public OpRewritePattern<scf::ParallelOp> {
*c30ab6c2SEugene Zhulenevpublic:
*c30ab6c2SEugene Zhulenev  AsyncParallelForRewrite(MLIRContext *ctx, int numConcurrentAsyncExecute)
*c30ab6c2SEugene Zhulenev      : OpRewritePattern(ctx),
*c30ab6c2SEugene Zhulenev        numConcurrentAsyncExecute(numConcurrentAsyncExecute) {}
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev  LogicalResult matchAndRewrite(scf::ParallelOp op,
*c30ab6c2SEugene Zhulenev                                PatternRewriter &rewriter) const override;
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenevprivate:
*c30ab6c2SEugene Zhulenev  int numConcurrentAsyncExecute;
*c30ab6c2SEugene Zhulenev};
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenevstruct AsyncParallelForPass
*c30ab6c2SEugene Zhulenev    : public AsyncParallelForBase<AsyncParallelForPass> {
*c30ab6c2SEugene Zhulenev  AsyncParallelForPass() = default;
*c30ab6c2SEugene Zhulenev  void runOnFunction() override;
*c30ab6c2SEugene Zhulenev};
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev} // namespace
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene ZhulenevLogicalResult
*c30ab6c2SEugene ZhulenevAsyncParallelForRewrite::matchAndRewrite(scf::ParallelOp op,
*c30ab6c2SEugene Zhulenev                                         PatternRewriter &rewriter) const {
*c30ab6c2SEugene Zhulenev  // We do not currently support rewrite for parallel op with reductions.
*c30ab6c2SEugene Zhulenev  if (op.getNumReductions() != 0)
*c30ab6c2SEugene Zhulenev    return failure();
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev  MLIRContext *ctx = op.getContext();
*c30ab6c2SEugene Zhulenev  Location loc = op.getLoc();
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev  // Index constants used below.
*c30ab6c2SEugene Zhulenev  auto indexTy = IndexType::get(ctx);
*c30ab6c2SEugene Zhulenev  auto zero = IntegerAttr::get(indexTy, 0);
*c30ab6c2SEugene Zhulenev  auto one = IntegerAttr::get(indexTy, 1);
*c30ab6c2SEugene Zhulenev  auto c0 = rewriter.create<ConstantOp>(loc, indexTy, zero);
*c30ab6c2SEugene Zhulenev  auto c1 = rewriter.create<ConstantOp>(loc, indexTy, one);
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev  // Shorthand for signed integer ceil division operation.
*c30ab6c2SEugene Zhulenev  auto divup = [&](Value x, Value y) -> Value {
*c30ab6c2SEugene Zhulenev    return rewriter.create<SignedCeilDivIOp>(loc, x, y);
*c30ab6c2SEugene Zhulenev  };
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev  // Compute trip count for each loop induction variable:
*c30ab6c2SEugene Zhulenev  //   tripCount = divUp(upperBound - lowerBound, step);
*c30ab6c2SEugene Zhulenev  SmallVector<Value, 4> tripCounts(op.getNumLoops());
*c30ab6c2SEugene Zhulenev  for (size_t i = 0; i < op.getNumLoops(); ++i) {
*c30ab6c2SEugene Zhulenev    auto lb = op.lowerBound()[i];
*c30ab6c2SEugene Zhulenev    auto ub = op.upperBound()[i];
*c30ab6c2SEugene Zhulenev    auto step = op.step()[i];
*c30ab6c2SEugene Zhulenev    auto range = rewriter.create<SubIOp>(loc, ub, lb);
*c30ab6c2SEugene Zhulenev    tripCounts[i] = divup(range, step);
*c30ab6c2SEugene Zhulenev  }
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev  // The target number of concurrent async.execute ops.
*c30ab6c2SEugene Zhulenev  auto numExecuteOps = rewriter.create<ConstantOp>(
*c30ab6c2SEugene Zhulenev      loc, indexTy, IntegerAttr::get(indexTy, numConcurrentAsyncExecute));
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev  // Blocks sizes configuration for each induction variable.
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev  // We try to use maximum available concurrency in outer dimensions first
*c30ab6c2SEugene Zhulenev  // (assuming that parallel induction variables are corresponding to some
*c30ab6c2SEugene Zhulenev  // multidimensional access, e.g. in (%d0, %d1, ..., %dn) = (<from>) to (<to>)
*c30ab6c2SEugene Zhulenev  // we will try to parallelize iteration along the %d0. If %d0 is too small,
*c30ab6c2SEugene Zhulenev  // we'll parallelize iteration over %d1, and so on.
*c30ab6c2SEugene Zhulenev  SmallVector<Value, 4> targetNumBlocks(op.getNumLoops());
*c30ab6c2SEugene Zhulenev  SmallVector<Value, 4> blockSize(op.getNumLoops());
*c30ab6c2SEugene Zhulenev  SmallVector<Value, 4> numBlocks(op.getNumLoops());
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev  // Compute block size and number of blocks along the first induction variable.
*c30ab6c2SEugene Zhulenev  targetNumBlocks[0] = numExecuteOps;
*c30ab6c2SEugene Zhulenev  blockSize[0] = divup(tripCounts[0], targetNumBlocks[0]);
*c30ab6c2SEugene Zhulenev  numBlocks[0] = divup(tripCounts[0], blockSize[0]);
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev  // Assign remaining available concurrency to other induction variables.
*c30ab6c2SEugene Zhulenev  for (size_t i = 1; i < op.getNumLoops(); ++i) {
*c30ab6c2SEugene Zhulenev    targetNumBlocks[i] = divup(targetNumBlocks[i - 1], numBlocks[i - 1]);
*c30ab6c2SEugene Zhulenev    blockSize[i] = divup(tripCounts[i], targetNumBlocks[i]);
*c30ab6c2SEugene Zhulenev    numBlocks[i] = divup(tripCounts[i], blockSize[i]);
*c30ab6c2SEugene Zhulenev  }
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev  // Create an async.group to wait on all async tokens from async execute ops.
*c30ab6c2SEugene Zhulenev  auto group = rewriter.create<CreateGroupOp>(loc, GroupType::get(ctx));
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev  // Build a scf.for loop nest from the parallel operation.
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev  // Lower/upper bounds for nest block level computations.
*c30ab6c2SEugene Zhulenev  SmallVector<Value, 4> blockLowerBounds(op.getNumLoops());
*c30ab6c2SEugene Zhulenev  SmallVector<Value, 4> blockUpperBounds(op.getNumLoops());
*c30ab6c2SEugene Zhulenev  SmallVector<Value, 4> blockInductionVars(op.getNumLoops());
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev  using LoopBodyBuilder =
*c30ab6c2SEugene Zhulenev      std::function<void(OpBuilder &, Location, Value, ValueRange)>;
*c30ab6c2SEugene Zhulenev  using LoopBuilder = std::function<LoopBodyBuilder(size_t loopIdx)>;
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev  // Builds inner loop nest inside async.execute operation that does all the
*c30ab6c2SEugene Zhulenev  // work concurrently.
*c30ab6c2SEugene Zhulenev  LoopBuilder workLoopBuilder = [&](size_t loopIdx) -> LoopBodyBuilder {
*c30ab6c2SEugene Zhulenev    return [&, loopIdx](OpBuilder &b, Location loc, Value iv, ValueRange args) {
*c30ab6c2SEugene Zhulenev      blockInductionVars[loopIdx] = iv;
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev      // Continute building async loop nest.
*c30ab6c2SEugene Zhulenev      if (loopIdx < op.getNumLoops() - 1) {
*c30ab6c2SEugene Zhulenev        b.create<scf::ForOp>(
*c30ab6c2SEugene Zhulenev            loc, blockLowerBounds[loopIdx + 1], blockUpperBounds[loopIdx + 1],
*c30ab6c2SEugene Zhulenev            op.step()[loopIdx + 1], ValueRange(), workLoopBuilder(loopIdx + 1));
*c30ab6c2SEugene Zhulenev        b.create<scf::YieldOp>(loc);
*c30ab6c2SEugene Zhulenev        return;
*c30ab6c2SEugene Zhulenev      }
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev      // Copy the body of the parallel op with new loop bounds.
*c30ab6c2SEugene Zhulenev      BlockAndValueMapping mapping;
*c30ab6c2SEugene Zhulenev      mapping.map(op.getInductionVars(), blockInductionVars);
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev      for (auto &bodyOp : op.getLoopBody().getOps())
*c30ab6c2SEugene Zhulenev        b.clone(bodyOp, mapping);
*c30ab6c2SEugene Zhulenev    };
*c30ab6c2SEugene Zhulenev  };
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev  // Builds a loop nest that does async execute op dispatching.
*c30ab6c2SEugene Zhulenev  LoopBuilder asyncLoopBuilder = [&](size_t loopIdx) -> LoopBodyBuilder {
*c30ab6c2SEugene Zhulenev    return [&, loopIdx](OpBuilder &b, Location loc, Value iv, ValueRange args) {
*c30ab6c2SEugene Zhulenev      auto lb = op.lowerBound()[loopIdx];
*c30ab6c2SEugene Zhulenev      auto ub = op.upperBound()[loopIdx];
*c30ab6c2SEugene Zhulenev      auto step = op.step()[loopIdx];
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev      // Compute lower bound for the current block:
*c30ab6c2SEugene Zhulenev      //   blockLowerBound = iv * blockSize * step + lowerBound
*c30ab6c2SEugene Zhulenev      auto s0 = b.create<MulIOp>(loc, iv, blockSize[loopIdx]);
*c30ab6c2SEugene Zhulenev      auto s1 = b.create<MulIOp>(loc, s0, step);
*c30ab6c2SEugene Zhulenev      auto s2 = b.create<AddIOp>(loc, s1, lb);
*c30ab6c2SEugene Zhulenev      blockLowerBounds[loopIdx] = s2;
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev      // Compute upper bound for the current block:
*c30ab6c2SEugene Zhulenev      //   blockUpperBound = min(upperBound,
*c30ab6c2SEugene Zhulenev      //                         blockLowerBound + blockSize * step)
*c30ab6c2SEugene Zhulenev      auto e0 = b.create<MulIOp>(loc, blockSize[loopIdx], step);
*c30ab6c2SEugene Zhulenev      auto e1 = b.create<AddIOp>(loc, e0, s2);
*c30ab6c2SEugene Zhulenev      auto e2 = b.create<CmpIOp>(loc, CmpIPredicate::slt, e1, ub);
*c30ab6c2SEugene Zhulenev      auto e3 = b.create<SelectOp>(loc, e2, e1, ub);
*c30ab6c2SEugene Zhulenev      blockUpperBounds[loopIdx] = e3;
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev      // Continue building async dispatch loop nest.
*c30ab6c2SEugene Zhulenev      if (loopIdx < op.getNumLoops() - 1) {
*c30ab6c2SEugene Zhulenev        b.create<scf::ForOp>(loc, c0, numBlocks[loopIdx + 1], c1, ValueRange(),
*c30ab6c2SEugene Zhulenev                             asyncLoopBuilder(loopIdx + 1));
*c30ab6c2SEugene Zhulenev        b.create<scf::YieldOp>(loc);
*c30ab6c2SEugene Zhulenev        return;
*c30ab6c2SEugene Zhulenev      }
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev      // Build the inner loop nest that will do the actual work inside the
*c30ab6c2SEugene Zhulenev      // `async.execute` body region.
*c30ab6c2SEugene Zhulenev      auto executeBodyBuilder = [&](OpBuilder &executeBuilder,
*c30ab6c2SEugene Zhulenev                                    Location executeLoc,
*c30ab6c2SEugene Zhulenev                                    ValueRange executeArgs) {
*c30ab6c2SEugene Zhulenev        executeBuilder.create<scf::ForOp>(executeLoc, blockLowerBounds[0],
*c30ab6c2SEugene Zhulenev                                          blockUpperBounds[0], op.step()[0],
*c30ab6c2SEugene Zhulenev                                          ValueRange(), workLoopBuilder(0));
*c30ab6c2SEugene Zhulenev        executeBuilder.create<async::YieldOp>(executeLoc, ValueRange());
*c30ab6c2SEugene Zhulenev      };
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev      auto execute = b.create<ExecuteOp>(
*c30ab6c2SEugene Zhulenev          loc, /*resultTypes=*/TypeRange(), /*dependencies=*/ValueRange(),
*c30ab6c2SEugene Zhulenev          /*operands=*/ValueRange(), executeBodyBuilder);
*c30ab6c2SEugene Zhulenev      auto rankType = IndexType::get(ctx);
*c30ab6c2SEugene Zhulenev      b.create<AddToGroupOp>(loc, rankType, execute.token(), group.result());
*c30ab6c2SEugene Zhulenev      b.create<scf::YieldOp>(loc);
*c30ab6c2SEugene Zhulenev    };
*c30ab6c2SEugene Zhulenev  };
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev  // Start building a loop nest from the first induction variable.
*c30ab6c2SEugene Zhulenev  rewriter.create<scf::ForOp>(loc, c0, numBlocks[0], c1, ValueRange(),
*c30ab6c2SEugene Zhulenev                              asyncLoopBuilder(0));
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev  // Wait for the completion of all subtasks.
*c30ab6c2SEugene Zhulenev  rewriter.create<AwaitAllOp>(loc, group.result());
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev  // Erase the original parallel operation.
*c30ab6c2SEugene Zhulenev  rewriter.eraseOp(op);
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev  return success();
*c30ab6c2SEugene Zhulenev}
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenevvoid AsyncParallelForPass::runOnFunction() {
*c30ab6c2SEugene Zhulenev  MLIRContext *ctx = &getContext();
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev  OwningRewritePatternList patterns;
*c30ab6c2SEugene Zhulenev  patterns.insert<AsyncParallelForRewrite>(ctx, numConcurrentAsyncExecute);
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenev  if (failed(applyPatternsAndFoldGreedily(getFunction(), std::move(patterns))))
*c30ab6c2SEugene Zhulenev    signalPassFailure();
*c30ab6c2SEugene Zhulenev}
*c30ab6c2SEugene Zhulenev
*c30ab6c2SEugene Zhulenevstd::unique_ptr<OperationPass<FuncOp>> mlir::createAsyncParallelForPass() {
*c30ab6c2SEugene Zhulenev  return std::make_unique<AsyncParallelForPass>();
*c30ab6c2SEugene Zhulenev}